In [5]:
import numpy as np
import pandas as pd
import featuretools as ft
from featuretools.primitives import *
In [6]:
# data is taken from kaggle.com/c/talkingdata-adtracking-fraud-detection
input_file = '../data/train_sample.csv'
pd.read_csv(input_file).head()
Out[6]:
ip
app
device
os
channel
click_time
attributed_time
is_attributed
0
87540
12
1
13
497
2017-11-07 09:30:38
NaN
0
1
105560
25
1
17
259
2017-11-07 13:40:27
NaN
0
2
101424
12
1
19
212
2017-11-07 18:05:24
NaN
0
3
94584
13
1
13
477
2017-11-07 04:58:08
NaN
0
4
68413
12
1
1
178
2017-11-09 09:00:09
NaN
0
In [7]:
dtypes = {
'ip': 'uint32',
'app': 'uint16',
'device': 'uint16',
'os': 'uint16',
'channel': 'uint16',
'is_attributed': 'uint8'
}
to_read = ['ip', 'app', 'device', 'os', 'channel', 'is_attributed', 'click_time']
to_parse = ['click_time']
In [33]:
df_train = pd.read_csv(input_file, usecols=to_read, dtype=dtypes, parse_dates=to_parse)
df_train.sort_values(by='click_time')
df_train['id'] = range(len(df_train))
df_train.head()
Out[33]:
ip
app
device
os
channel
click_time
is_attributed
id
0
87540
12
1
13
497
2017-11-07 09:30:38
0
0
1
105560
25
1
17
259
2017-11-07 13:40:27
0
1
2
101424
12
1
19
212
2017-11-07 18:05:24
0
2
3
94584
13
1
13
477
2017-11-07 04:58:08
0
3
4
68413
12
1
1
178
2017-11-09 09:00:09
0
4
In [34]:
es = ft.EntitySet(id='clicks')
es = es.entity_from_dataframe(
entity_id='clicks',
dataframe=df_train,
index='id',
time_index='click_time',
variable_types={
'ip': ft.variable_types.Categorical,
'app': ft.variable_types.Categorical,
'device': ft.variable_types.Categorical,
'os': ft.variable_types.Categorical,
'channel': ft.variable_types.Categorical,
'is_attributed': ft.variable_types.Boolean,
}
)
es = es.normalize_entity(base_entity_id='clicks', new_entity_id='apps', index='app', make_time_index=False)
In [35]:
es
Out[35]:
Entityset: clicks
Entities:
clicks (shape = [100000, 8])
apps (shape = [161, 1])
Relationships:
clicks.app -> apps.app
In [36]:
# Hint Featuretools: creates features for the particular value
es['clicks']['is_attributed'].interesting_values = [True]
In [45]:
# Manually defined domain specific feature: "Seed Features"
google_apps = ft.Feature(es['apps']['app']).isin([1,2,3,4,5])
In [56]:
X = df_train.copy()
X_features = X[X['id'] < 70000]
X_train = X[X['id'] >= 70000]
cutoff_time = X_features['click_time'].max()
cutoff_time
Out[56]:
Timestamp('2017-11-09 15:59:51')
In [63]:
feature_matrix, feature_defs = ft.dfs(
entityset=es,
target_entity='apps',
seed_features=[google_apps],
cutoff_time=cutoff_time,
training_window=ft.Timedelta("3 days"),
max_depth=5
)
2018-04-02 17:55:43,927 featuretools.entityset - WARNING Using training_window but last_time_index is not set on entity clicks
In [64]:
feature_defs
Out[64]:
[<Feature: app.isin([1, 2, 3, 4, 5])>,
<Feature: COUNT(clicks)>,
<Feature: PERCENT_TRUE(clicks.is_attributed)>,
<Feature: NUM_UNIQUE(clicks.ip)>,
<Feature: NUM_UNIQUE(clicks.device)>,
<Feature: NUM_UNIQUE(clicks.os)>,
<Feature: NUM_UNIQUE(clicks.channel)>,
<Feature: MODE(clicks.ip)>,
<Feature: MODE(clicks.device)>,
<Feature: MODE(clicks.os)>,
<Feature: MODE(clicks.channel)>,
<Feature: COUNT(clicks WHERE is_attributed = True)>,
<Feature: NUM_UNIQUE(clicks.DAY(click_time))>,
<Feature: NUM_UNIQUE(clicks.YEAR(click_time))>,
<Feature: NUM_UNIQUE(clicks.MONTH(click_time))>,
<Feature: NUM_UNIQUE(clicks.WEEKDAY(click_time))>,
<Feature: MODE(clicks.DAY(click_time))>,
<Feature: MODE(clicks.YEAR(click_time))>,
<Feature: MODE(clicks.MONTH(click_time))>,
<Feature: MODE(clicks.WEEKDAY(click_time))>]
In [62]:
feature_matrix
Out[62]:
app.isin([1, 2, 3, 4, 5])
COUNT(clicks)
PERCENT_TRUE(clicks.is_attributed)
NUM_UNIQUE(clicks.ip)
NUM_UNIQUE(clicks.device)
NUM_UNIQUE(clicks.os)
NUM_UNIQUE(clicks.channel)
MODE(clicks.ip)
MODE(clicks.device)
MODE(clicks.os)
MODE(clicks.channel)
COUNT(clicks WHERE is_attributed = True)
NUM_UNIQUE(clicks.DAY(click_time))
NUM_UNIQUE(clicks.YEAR(click_time))
NUM_UNIQUE(clicks.MONTH(click_time))
NUM_UNIQUE(clicks.WEEKDAY(click_time))
MODE(clicks.DAY(click_time))
MODE(clicks.YEAR(click_time))
MODE(clicks.MONTH(click_time))
MODE(clicks.WEEKDAY(click_time))
app
1
True
3135
0.000000
2723
4
56
27
5348
1
19
134
0.0
4
1
1
4
7
2017
11
1
2
True
11737
0.000000
7759
3
75
21
5314
1
19
477
0.0
4
1
1
4
8
2017
11
2
3
True
18279
0.000219
12040
4
78
32
5348
1
19
280
4.0
4
1
1
4
8
2017
11
2
4
True
58
0.000000
56
2
20
1
79881
1
19
101
0.0
3
1
1
3
9
2017
11
3
5
True
188
0.074468
187
2
36
4
26995
1
19
377
14.0
4
1
1
4
7
2017
11
1
6
False
1303
0.000000
1209
2
47
4
5314
1
19
459
0.0
4
1
1
4
7
2017
11
1
7
False
981
0.000000
790
4
56
1
48240
1
13
101
0.0
3
1
1
3
9
2017
11
3
8
False
2004
0.001996
1778
3
51
3
5348
1
19
145
4.0
4
1
1
4
9
2017
11
3
9
False
8992
0.000890
6721
5
73
29
5348
1
19
466
8.0
4
1
1
4
9
2017
11
3
10
False
388
0.046392
373
1
42
5
5348
1
19
377
18.0
4
1
1
4
7
2017
11
1
11
False
1927
0.001038
1742
2
53
12
5348
1
19
319
2.0
4
1
1
4
8
2017
11
2
12
False
13198
0.000076
9190
4
71
26
73487
1
19
178
1.0
4
1
1
4
8
2017
11
2
13
False
2422
0.000000
2179
4
60
6
5314
1
19
477
0.0
4
1
1
4
7
2017
11
1
14
False
5359
0.000000
4442
3
63
28
5314
1
19
379
0.0
4
1
1
4
8
2017
11
2
15
False
8595
0.000233
6473
2
67
24
5348
1
19
245
2.0
4
1
1
4
7
2017
11
1
16
False
3
0.000000
3
1
2
1
18966
1
18
268
0.0
2
1
1
2
7
2017
11
1
17
False
380
0.000000
345
2
40
4
17149
1
19
280
0.0
4
1
1
4
8
2017
11
2
18
False
8315
0.000601
6441
5
69
12
5348
1
19
107
5.0
4
1
1
4
8
2017
11
2
19
False
478
0.146444
448
76
14
8
5348
0
24
213
70.0
4
1
1
4
9
2017
11
3
20
False
911
0.001098
856
2
49
4
5314
1
19
259
1.0
4
1
1
4
7
2017
11
1
21
False
1979
0.000000
1732
2
57
3
5314
1
19
128
0.0
4
1
1
4
7
2017
11
1
22
False
386
0.000000
373
2
37
2
5314
1
19
116
0.0
4
1
1
4
7
2017
11
1
23
False
1454
0.000000
1351
2
46
3
5314
1
19
153
0.0
4
1
1
4
8
2017
11
2
24
False
704
0.000000
645
2
52
2
5348
1
19
105
0.0
4
1
1
4
7
2017
11
1
25
False
804
0.000000
740
2
41
1
5348
1
13
259
0.0
4
1
1
4
7
2017
11
1
26
False
1633
0.000000
1511
2
53
4
5348
1
19
121
0.0
4
1
1
4
8
2017
11
2
27
False
696
0.000000
662
2
43
2
5314
1
19
153
0.0
4
1
1
4
7
2017
11
1
28
False
720
0.000000
691
2
42
2
5348
1
19
135
0.0
4
1
1
4
7
2017
11
1
29
False
360
0.061111
329
3
43
8
5348
1
19
343
22.0
4
1
1
4
7
2017
11
1
30
False
2
0.000000
2
1
1
1
81643
3866
866
347
0.0
2
1
1
2
8
2017
11
2
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
202
False
6
0.166667
6
1
5
1
31277
1
6
421
1.0
3
1
1
3
7
2017
11
1
204
False
2
0.000000
2
2
2
1
37424
3032
607
347
0.0
2
1
1
2
7
2017
11
1
208
False
13
0.076923
13
1
5
2
7308
1
19
330
1.0
2
1
1
2
7
2017
11
1
215
False
4
0.000000
3
2
2
1
269087
3543
748
347
0.0
2
1
1
2
8
2017
11
2
216
False
1
0.000000
1
1
1
1
217078
1
22
213
0.0
1
1
1
1
8
2017
11
2
232
False
9
0.000000
9
3
3
1
32356
3866
866
347
0.0
3
1
1
3
8
2017
11
2
233
False
1
0.000000
1
1
1
1
27629
3032
607
347
0.0
1
1
1
1
7
2017
11
1
261
False
1
1.000000
1
1
1
1
288079
0
0
243
1.0
1
1
1
1
9
2017
11
3
266
False
2
0.000000
2
2
2
1
26622
3032
607
347
0.0
2
1
1
2
6
2017
11
0
267
False
1
0.000000
1
1
1
1
99154
3543
748
347
0.0
1
1
1
1
8
2017
11
2
268
False
1
0.000000
1
1
1
1
43827
3032
607
347
0.0
1
1
1
1
6
2017
11
0
271
False
1
0.000000
1
1
1
1
67316
3543
748
347
0.0
1
1
1
1
8
2017
11
2
273
False
3
0.000000
2
1
2
1
201182
0
0
347
0.0
2
1
1
2
7
2017
11
1
293
False
1
0.000000
1
1
1
1
59384
0
0
347
0.0
1
1
1
1
9
2017
11
3
302
False
1
0.000000
1
1
1
1
44725
3543
748
347
0.0
1
1
1
1
8
2017
11
2
310
False
3
0.000000
3
1
3
1
5348
0
0
272
0.0
1
1
1
1
9
2017
11
3
315
False
4
0.000000
4
1
4
1
14884
1
13
110
0.0
3
1
1
3
7
2017
11
1
347
False
1
0.000000
1
1
1
1
245890
3543
748
347
0.0
1
1
1
1
8
2017
11
2
363
False
2
0.000000
2
1
1
1
41463
3543
748
347
0.0
2
1
1
2
7
2017
11
1
372
False
1
0.000000
1
1
1
1
125141
1
18
107
0.0
1
1
1
1
7
2017
11
1
394
False
2
0.000000
2
2
2
1
105475
3032
607
347
0.0
2
1
1
2
7
2017
11
1
398
False
1
0.000000
1
1
1
1
49462
1
36
347
0.0
1
1
1
1
7
2017
11
1
407
False
1
0.000000
1
1
1
1
44527
3543
748
347
0.0
1
1
1
1
8
2017
11
2
425
False
2
0.000000
2
2
2
1
26995
3032
607
347
0.0
2
1
1
2
7
2017
11
1
474
False
1
0.000000
1
1
1
1
114220
0
38
272
0.0
1
1
1
1
9
2017
11
3
486
False
1
0.000000
1
1
1
1
245311
0
24
21
0.0
1
1
1
1
7
2017
11
1
536
False
1
0.000000
1
1
1
1
49856
1
19
21
0.0
1
1
1
1
6
2017
11
0
538
False
1
0.000000
1
1
1
1
4136
3032
607
347
0.0
1
1
1
1
7
2017
11
1
548
False
1
0.000000
1
1
1
1
191061
3543
748
347
0.0
1
1
1
1
8
2017
11
2
551
False
1
0.000000
1
1
1
1
67285
347
113
243
0.0
1
1
1
1
7
2017
11
1
161 rows × 20 columns
In [ ]:
Content source: alexandrnikitin/workshops
Similar notebooks: