Scaling Featuretools with Dask

  • https://dask.pydata.org/en/latest/
  • "Dask provides advanced parallelism for analytics, enabling performance at scale for the tools you love"
  • "Dask's schedulers scale to thousand-node clusters and its algorithms have been tested on some of the largest supercomputers in the world."
  • Works with NumPy, Pandas, and Scikit-Learn, mimicking their APIs (a minimal sketch below)
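A minimal sketch of that pandas-mimicking API, assuming dask.dataframe is installed and reusing the raw sample file from step 1 (dask.dataframe is not otherwise used in this notebook):

import dask.dataframe as dd

# Lazy, chunked read of the same CSV used below; nothing is loaded yet
ddf = dd.read_csv('../data/train_sample.csv')

# The familiar pandas groupby API; the work runs in parallel on .compute()
ddf.groupby('app').size().compute()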

In [23]:
import os
from datetime import datetime
from glob import glob

import numpy as np
import pandas as pd
import featuretools as ft

from dask import bag
from dask.diagnostics import ProgressBar
from featuretools.primitives import *

In [5]:
pbar = ProgressBar()
pbar.register()

1. Partition data


In [3]:
# data is taken from kaggle.com/c/talkingdata-adtracking-fraud-detection
input_file = '../data/train_sample.csv'
output_dir = "../data/partitioned"

def partition_by(df, column, output_dir):
    """Write one CSV per unique value of `column` into `output_dir/column/`."""
    directory = f"{output_dir}/{column}"
    if not os.path.exists(directory):
        os.makedirs(directory)
    df.groupby(column).apply(lambda x: x.to_csv(f"{directory}/train_{x.name}.csv", index=False))

partition_by(pd.read_csv(input_file), 'app', output_dir)
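A quick sanity check of what partition_by just wrote, assuming the paths above (one file per unique app id is expected):

# List the per-app partition files produced above
partition_files = sorted(glob(f"{output_dir}/app/train_*.csv"))
print(len(partition_files), partition_files[:3])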

2. Create distributed EntitySets


In [6]:
input_path = '../data/partitioned/app'

dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8'
}
to_read = ['app', 'device', 'os', 'channel', 'is_attributed', 'click_time']
to_parse = ['click_time']

In [7]:
filenames = glob(f"{input_path}/train_*.csv")

In [8]:
def createEntitySet(filename):
    df = pd.read_csv(filename, usecols=to_read, dtype=dtypes, parse_dates=to_parse)
    df['id'] = range(len(df))  # every click needs a unique index

    es = ft.EntitySet(id='clicks')
    es = es.entity_from_dataframe(
        entity_id='clicks',
        dataframe=df,
        index='id',
        time_index='click_time',
        variable_types={
            'app': ft.variable_types.Categorical,
            'device': ft.variable_types.Categorical,
            'os': ft.variable_types.Categorical,
            'channel': ft.variable_types.Categorical,
            'is_attributed': ft.variable_types.Boolean,
        }
    )

    # Derive a parent 'apps' entity with one row per unique app
    es = es.normalize_entity(base_entity_id='clicks', new_entity_id='apps', index='app', make_time_index=False)
    es.add_last_time_indexes()
    return es

In [9]:
b = bag.from_sequence(filenames)
entity_sets = b.map(createEntitySet)
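The bag is lazy, so no CSV has been read yet. An optional spot check that materializes only the first EntitySet (this does trigger a small computation):

# Pull the first EntitySet out of the bag to confirm the partitions parse correctly
entity_sets.take(1)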

3. Calculate feature matrices and definitions


In [11]:
def calc_feature_matrix(es, entity_id, cutoff_time):
    feature_matrix, feature_defs = ft.dfs(
        entityset=es,
        target_entity=entity_id,
        cutoff_time=cutoff_time,
        training_window=ft.Timedelta("3 days"),
        max_depth=3
    )

    return feature_matrix, feature_defs

In [26]:
# For the sake of simplicity, we use a predefined cutoff time
cutoff_time = datetime(2017, 11, 9, 15, 59, 51)
cutoff_time


Out[26]:
datetime.datetime(2017, 11, 9, 15, 59, 51)

In [29]:
feature_matrices = entity_sets.map(calc_feature_matrix, entity_id='apps', cutoff_time=cutoff_time)
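feature_matrices is still a lazy bag at this point. Before computing everything, a single partition can be checked eagerly; a sketch that bypasses Dask entirely:

# Run DFS on one partition without Dask, just to validate the pipeline end-to-end
fm_one, defs_one = calc_feature_matrix(createEntitySet(filenames[0]), entity_id='apps', cutoff_time=cutoff_time)
fm_one.head()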

4. Compute the distributed features


In [30]:
out = feature_matrices.compute()

# Every partition yields the same feature definitions, so keep a single copy
_, feature_defs = out[0]

# Unzip the (feature_matrix, feature_defs) pairs and stack the matrices
feature_matrices = [fm for fm, _ in out]
feature_matrix = pd.concat(feature_matrices)


[########################################] | 100% Completed |  4.4s
[########################################] | 100% Completed |  4.5s

In [31]:
feature_defs


Out[31]:
[<Feature: COUNT(clicks)>,
 <Feature: PERCENT_TRUE(clicks.is_attributed)>,
 <Feature: NUM_UNIQUE(clicks.device)>,
 <Feature: NUM_UNIQUE(clicks.os)>,
 <Feature: NUM_UNIQUE(clicks.channel)>,
 <Feature: MODE(clicks.device)>,
 <Feature: MODE(clicks.os)>,
 <Feature: MODE(clicks.channel)>,
 <Feature: NUM_UNIQUE(clicks.DAY(click_time))>,
 <Feature: NUM_UNIQUE(clicks.YEAR(click_time))>,
 <Feature: NUM_UNIQUE(clicks.MONTH(click_time))>,
 <Feature: NUM_UNIQUE(clicks.WEEKDAY(click_time))>,
 <Feature: MODE(clicks.DAY(click_time))>,
 <Feature: MODE(clicks.YEAR(click_time))>,
 <Feature: MODE(clicks.MONTH(click_time))>,
 <Feature: MODE(clicks.WEEKDAY(click_time))>]
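The feature definitions are identical across partitions, which is why only one copy was kept above. They can be re-applied to fresh data with ft.calculate_feature_matrix; a sketch, assuming a hypothetical new partition file:

# Recompute the same features on new data using the saved definitions
es_new = createEntitySet('../data/partitioned/app/new_partition.csv')  # hypothetical file
fm_new = ft.calculate_feature_matrix(features=feature_defs, entityset=es_new, cutoff_time=cutoff_time)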

In [32]:
feature_matrix


Out[32]:
COUNT(clicks) PERCENT_TRUE(clicks.is_attributed) NUM_UNIQUE(clicks.device) NUM_UNIQUE(clicks.os) NUM_UNIQUE(clicks.channel) MODE(clicks.device) MODE(clicks.os) MODE(clicks.channel) NUM_UNIQUE(clicks.DAY(click_time)) NUM_UNIQUE(clicks.YEAR(click_time)) NUM_UNIQUE(clicks.MONTH(click_time)) NUM_UNIQUE(clicks.WEEKDAY(click_time)) MODE(clicks.DAY(click_time)) MODE(clicks.YEAR(click_time)) MODE(clicks.MONTH(click_time)) MODE(clicks.WEEKDAY(click_time))
app
163 1 0.000000 1 1 1 0 0 4 1 1 1 1 9 2017 11 3
19 478 0.146444 76 14 8 0 24 213 4 1 1 4 9 2017 11 3
134 4 0.000000 3 3 1 3032 607 347 2 1 1 2 7 2017 11 1
538 1 0.000000 1 1 1 3032 607 347 1 1 1 1 7 2017 11 1
48 3 0.333333 1 2 1 1 13 213 1 1 1 1 7 2017 11 1
16 3 0.000000 1 2 1 1 18 268 2 1 1 2 7 2017 11 1
202 6 0.166667 1 5 1 1 6 421 3 1 1 3 7 2017 11 1
115 1 1.000000 1 1 1 1 22 203 1 1 1 1 9 2017 11 3
8 2004 0.001996 3 51 3 1 19 145 4 1 1 4 9 2017 11 3
32 286 0.003497 2 37 2 1 19 376 4 1 1 4 9 2017 11 3
192 1 0.000000 1 1 1 7 21 320 1 1 1 1 7 2017 11 1
66 9 0.000000 2 8 2 1 13 101 3 1 1 3 9 2017 11 3
36 102 0.000000 1 15 3 1 13 373 4 1 1 4 7 2017 11 1
21 1979 0.000000 2 57 3 1 19 128 4 1 1 4 7 2017 11 1
7 981 0.000000 4 56 1 1 13 101 3 1 1 3 9 2017 11 3
315 4 0.000000 1 4 1 1 13 110 3 1 1 3 7 2017 11 1
109 10 0.000000 1 4 1 0 59 347 3 1 1 3 8 2017 11 2
107 19 0.105263 2 7 1 1 19 171 4 1 1 4 8 2017 11 2
146 3 0.000000 2 2 1 3543 748 347 2 1 1 2 8 2017 11 2
310 3 0.000000 1 3 1 0 0 272 1 1 1 1 9 2017 11 3
150 73 0.000000 1 16 2 1 19 110 4 1 1 4 7 2017 11 1
121 3 0.000000 1 3 1 1 13 203 3 1 1 3 6 2017 11 0
112 1 0.000000 1 1 1 6 29 213 1 1 1 1 7 2017 11 1
94 40 0.000000 2 18 1 1 19 361 4 1 1 4 8 2017 11 2
6 1303 0.000000 2 47 4 1 19 459 4 1 1 4 7 2017 11 1
347 1 0.000000 1 1 1 3543 748 347 1 1 1 1 8 2017 11 2
293 1 0.000000 1 1 1 0 0 347 1 1 1 1 9 2017 11 3
43 10 0.000000 1 6 1 1 19 330 4 1 1 4 7 2017 11 1
100 1 0.000000 1 1 1 1 19 347 1 1 1 1 7 2017 11 1
71 2 1.000000 1 2 1 1 19 3 2 1 1 2 8 2017 11 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
52 3 0.000000 1 2 1 1 19 379 3 1 1 3 7 2017 11 1
25 804 0.000000 2 41 1 1 13 259 4 1 1 4 7 2017 11 1
261 1 1.000000 1 1 1 0 0 243 1 1 1 1 9 2017 11 3
145 1 1.000000 1 1 1 50 0 320 1 1 1 1 9 2017 11 3
23 1454 0.000000 2 46 3 1 19 153 4 1 1 4 8 2017 11 2
548 1 0.000000 1 1 1 3543 748 347 1 1 1 1 8 2017 11 2
158 2 0.000000 1 2 1 182 116 455 1 1 1 1 8 2017 11 2
536 1 0.000000 1 1 1 1 19 21 1 1 1 1 6 2017 11 0
118 2 0.000000 1 2 1 1 19 452 2 1 1 2 6 2017 11 0
74 7 0.000000 1 6 1 1 13 21 4 1 1 4 7 2017 11 1
116 1 1.000000 1 1 1 1 35 101 1 1 1 1 7 2017 11 1
266 2 0.000000 2 2 1 3032 607 347 2 1 1 2 6 2017 11 0
61 3 0.000000 1 3 1 1 13 21 2 1 1 2 7 2017 11 1
273 3 0.000000 1 2 1 0 0 347 2 1 1 2 7 2017 11 1
551 1 0.000000 1 1 1 347 113 243 1 1 1 1 7 2017 11 1
60 21 0.047619 1 11 1 1 10 21 3 1 1 3 9 2017 11 3
45 36 0.305556 3 18 6 1 19 411 3 1 1 3 8 2017 11 2
56 50 0.000000 2 18 1 1 13 406 2 1 1 2 8 2017 11 2
82 21 0.000000 1 10 3 1 13 21 3 1 1 3 7 2017 11 1
35 49 0.551020 1 20 2 1 19 21 4 1 1 4 7 2017 11 1
124 1 0.000000 1 1 1 1 57 224 1 1 1 1 9 2017 11 3
46 14 0.000000 1 5 1 0 0 347 4 1 1 4 7 2017 11 1
372 1 0.000000 1 1 1 1 18 107 1 1 1 1 7 2017 11 1
215 4 0.000000 2 2 1 3543 748 347 2 1 1 2 8 2017 11 2
123 1 0.000000 1 1 1 0 24 213 1 1 1 1 7 2017 11 1
108 3 0.666667 1 2 1 1 19 243 2 1 1 2 8 2017 11 2
27 696 0.000000 2 43 2 1 19 153 4 1 1 4 7 2017 11 1
67 5 0.000000 2 4 1 1 607 347 2 1 1 2 7 2017 11 1
85 2 0.000000 1 1 1 3543 748 347 2 1 1 2 7 2017 11 1
4 58 0.000000 2 20 1 1 19 101 3 1 1 3 9 2017 11 3

161 rows × 16 columns
