In [54]:
from IPython.display import Image, SVG
In [56]:
# Show the diagram stored next to the notebook. `Image` must be imported from
# IPython.display for this cell to run on a fresh kernel.
Image(url="../img/max-order-size.svg", width=600, height=600)
Out[56]:
In [17]:
import numpy as np
import pandas as pd
import featuretools as ft
from featuretools.primitives import *
In [20]:
# Data is taken from kaggle.com/c/talkingdata-adtracking-fraud-detection
input_file = '../data/train_sample.csv'
# Preview only: nrows=5 stops parsing after five rows instead of loading the
# whole file just to display its head.
pd.read_csv(input_file, nrows=5)
Out[20]:
In [21]:
# Explicit unsigned-integer dtypes for the columns loaded below.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)

# Columns parsed as datetimes, and the full set of columns pulled from the CSV.
to_parse = ['click_time']
to_read = ['ip', 'app', 'device', 'os', 'channel', 'is_attributed', 'click_time']

df_train = pd.read_csv(
    input_file,
    usecols=to_read,
    dtype=dtypes,
    parse_dates=to_parse,
)

# Add a sequential row number to serve as the index column of the entity.
df_train['id'] = range(len(df_train))
In [24]:
# Create the EntitySet (id 'clicks'); entities are added to it in the following cells.
es = ft.EntitySet(id='clicks')
In [26]:
# Register df_train as the "clicks" entity: `id` is its index, `click_time`
# its time index. The five id-like columns are declared Categorical and the
# `is_attributed` flag Boolean.
es = es.entity_from_dataframe(
    dataframe=df_train,
    entity_id='clicks',
    index='id',
    time_index='click_time',
    variable_types=dict(
        dict.fromkeys(
            ['ip', 'app', 'device', 'os', 'channel'],
            ft.variable_types.Categorical,
        ),
        is_attributed=ft.variable_types.Boolean,
    ),
)
In [28]:
# Display the entity set now that the "clicks" entity has been registered.
es
Out[28]:
In [27]:
# List the variables registered on the "clicks" entity.
es["clicks"].variables
Out[27]:
In [35]:
# Template (intentionally not executed): how a second entity would be added to
# the entity set and linked to "clicks" through a relationship.
# NOTE(review): `df_users` and a `user_id` column on "clicks" are not defined
# anywhere in the visible notebook, so this cannot run as written.
#
# es = es.entity_from_dataframe(
#     entity_id='users',
#     dataframe=df_users,
#     index='id')
# new_relationship = ft.Relationship(
#     es["users"]["id"],
#     es["clicks"]["user_id"])
# es = es.add_relationship(new_relationship)
Out[35]:
In [29]:
# Derive a new "apps" entity from "clicks", keyed on the `app` column and
# created without a time index.
es = es.normalize_entity(
    new_entity_id='apps',
    base_entity_id='clicks',
    index='app',
    make_time_index=False,
)
In [30]:
# Re-inspect the "clicks" variables after the normalize_entity call that created "apps".
es['clicks'].variables
Out[30]:
In [31]:
# List the variables of the newly created "apps" entity.
es['apps'].variables
Out[31]:
In [32]:
# Display the entity set again, now that the "apps" entity has been added.
es
Out[32]:
In [44]:
# Run ft.dfs over the entity set with "apps" as the target entity and
# max_depth=3; it returns the feature matrix and the feature definitions.
feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity="apps", max_depth=3)
In [45]:
# Show the feature definitions returned by ft.dfs.
feature_defs
Out[45]:
In [43]:
# Preview the first rows of the generated feature matrix.
feature_matrix.head()
Out[43]:
**Aggregation primitives:** These primitives take related instances as input and output a single value. They are applied across a parent-child relationship in an entity set. E.g. `Count`, `Sum`, `AvgTimeBetween`.
**Transform primitives:** These primitives take one or more variables from an entity as input and output a new variable for that entity. They are applied to a single entity. E.g. `Hour`, `TimeSincePrevious`, `Absolute`.
**Custom primitives:** You can also define your own aggregation and transform primitives.
In [61]:
class Sum(AggregationPrimitive):
    """Sum the values of a numeric feature, treating NaN as zero.

    The aggregation function converts the input's ``.values`` with
    ``np.nan_to_num`` before summing, so missing values contribute 0 to the
    total. Note that ``np.nan_to_num`` also replaces +/-inf with very large
    finite numbers. The result is accumulated as a 64-bit float.
    """
    name = "sum"
    input_types = [Numeric]
    return_type = Numeric
    # NOTE(review): presumably these keep Sum from being stacked on another
    # Sum or on a Count feature -- confirm against the featuretools docs.
    stack_on_self = False
    stack_on_exclude = [Count]

    # TODO: decide how nulls should be handled; for now NaN is counted as 0.
    def get_function(self):
        """Return the callable that aggregates one group of values."""
        def sum_func(x):
            # `x` must expose `.values` (e.g. a pandas Series).
            # np.float64 replaces the `np.float` alias, which was removed in
            # NumPy 1.24; the alias pointed at the builtin float, so the
            # accumulator dtype is unchanged.
            return np.nan_to_num(x.values).sum(dtype=np.float64)
        return sum_func
In [ ]:
# The base class is defined first so that `Day` below subclasses the class
# from this cell; with the original order, `Day` was evaluated before its base
# class existed in the cell.
class DatetimeUnitBasePrimitive(TransformPrimitive):
    """Transform Datetime feature into time or calendar units
    (second/day/week/etc).

    Subclasses only set ``name`` to the unit they extract; the transform
    itself is shared through ``get_function``.
    """
    name = None
    input_types = [Datetime]
    return_type = Ordinal

    def get_function(self):
        """Return a callable that extracts the ``self.name`` unit from an array."""
        # The input is wrapped in a pd.DatetimeIndex and handed to the callable
        # returned by pd_time_unit(self.name).
        # NOTE(review): `pd_time_unit` is not defined in the visible notebook;
        # presumably it comes from `from featuretools.primitives import *` --
        # confirm it is in scope.
        return lambda array: pd_time_unit(self.name)(pd.DatetimeIndex(array))


class Day(DatetimeUnitBasePrimitive):
    """Transform a Datetime feature into the day"""
    name = "day"
In [ ]: