In [23]:
import os
from datetime import datetime
from glob import glob
import numpy as np
import pandas as pd
import featuretools as ft
from dask import bag
from dask.diagnostics import ProgressBar
from featuretools.primitives import *
In [5]:
# register a global progress bar for every dask compute call
pbar = ProgressBar()
pbar.register()
In [3]:
# data is taken from kaggle.com/c/talkingdata-adtracking-fraud-detection
input_file = '../data/train_sample.csv'
output_dir = "../data/partitioned"

def partition_by(df, column, output_dir):
    # write one CSV per unique value of `column`, so each partition
    # can later be processed independently and in parallel
    directory = f"{output_dir}/{column}"
    if not os.path.exists(directory):
        os.makedirs(directory)
    df.groupby(column).apply(lambda x: x.to_csv(f"{directory}/train_{x.name}.csv", index=False))

partition_by(pd.read_csv(input_file), 'app', output_dir)
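Partitioning by app guarantees that all clicks for a given app land in a single file, so feature engineering for one app never needs data from another partition. A quick sanity check on the result (a minimal sketch; the file count depends on how many distinct apps the sample contains):
In [ ]:
# sanity check: expect one file per distinct app in the sample
len(glob(f"{output_dir}/app/train_*.csv"))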
In [6]:
input_path = '../data/partitioned/app'
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8'
}
to_read = ['app', 'device', 'os', 'channel', 'is_attributed', 'click_time']
to_parse = ['click_time']
In [7]:
filenames = glob(f"{input_path}/train_*.csv")
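The narrow unsigned dtypes are worth the boilerplate: without them pandas defaults every id column to int64, roughly quadrupling the footprint of the uint16 columns. One way to see what a single partition costs in memory (a minimal sketch, assuming the partition files exist):
In [ ]:
# sketch: memory footprint of one partition with the compact dtypes applied
sample = pd.read_csv(filenames[0], usecols=to_read, dtype=dtypes, parse_dates=to_parse)
sample.memory_usage(deep=True).sum()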
In [8]:
def createEntitySet(filename):
    df = pd.read_csv(filename, usecols=to_read, dtype=dtypes, parse_dates=to_parse)
    df['id'] = range(len(df))  # surrogate key: each click needs a unique index
    es = ft.EntitySet(id='clicks')
    es = es.entity_from_dataframe(
        entity_id='clicks',
        dataframe=df,
        index='id',
        time_index='click_time',
        variable_types={
            'app': ft.variable_types.Categorical,
            'device': ft.variable_types.Categorical,
            'os': ft.variable_types.Categorical,
            'channel': ft.variable_types.Categorical,
            'is_attributed': ft.variable_types.Boolean,
        }
    )
    # split out an `apps` parent entity; each partition holds a single app,
    # so this entity has exactly one row per partition
    es = es.normalize_entity(base_entity_id='clicks', new_entity_id='apps', index='app', make_time_index=False)
    es.add_last_time_indexes()
    return es
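Before mapping this over every partition, it helps to build one EntitySet by hand and confirm that the two entities and the relationship between them look right (a minimal sketch):
In [ ]:
# sketch: inspect the entities and relationship for a single partition
createEntitySet(filenames[0])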
In [9]:
b = bag.from_sequence(filenames)
entity_sets = b.map(createEntitySet)
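bag.map is lazy: this cell only records the work to be done, and no file is read until something triggers a compute. To materialize a single EntitySet for debugging without running the whole bag (a minimal sketch):
In [ ]:
# sketch: .take(1) computes only the first partition and returns a tuple
first_es = entity_sets.take(1)[0]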
In [11]:
def calc_feature_matrix(es, entity_id, cutoff_time):
    # only clicks inside the 3-day window ending at cutoff_time
    # are allowed to contribute to each feature value
    feature_matrix, feature_defs = ft.dfs(
        entityset=es,
        target_entity=entity_id,
        cutoff_time=cutoff_time,
        training_window=ft.Timedelta("3 days"),
        max_depth=3
    )
    return feature_matrix, feature_defs
In [26]:
# For simplicity we use a single, predefined cutoff time for every app
cutoff_time = datetime(2017, 11, 9, 15, 59, 51)
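With the cutoff fixed, it is worth running DFS on a single partition before computing the whole bag (a minimal sketch, reusing first_es from the .take(1) example above):
In [ ]:
# sketch: validate DFS output on one partition first
fm_one, defs_one = calc_feature_matrix(first_es, entity_id='apps', cutoff_time=cutoff_time)
fm_one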
In [29]:
feature_matrices = entity_sets.map(calc_feature_matrix, entity_id='apps', cutoff_time=cutoff_time)
In [30]:
out = feature_matrices.compute()
# each element of `out` is a (feature_matrix, feature_defs) pair;
# the definitions are identical across partitions, so keep one copy
_, feature_defs = out[0]
feature_matrices = [fm for fm, _ in out]
feature_matrix = pd.concat(feature_matrices)
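Because every app lives in exactly one partition, the concatenated matrix should contain one row per app with no index collisions (a quick hedged check):
In [ ]:
# sketch: disjoint partitions imply a unique app index after concat
assert feature_matrix.index.is_unique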
In [31]:
feature_defs
Out[31]:
In [32]:
feature_matrix
Out[32]: