In [ ]:
import pandas as pd
import numpy as np
import featuretools as ft
%load_ext autoreload
%autoreload 2

# synthetic dataset dimensions for the benchmark
num_rows = 10000
int_cols = 10
float_cols = 10
id_columns = 4
num_groups = 1000

In [ ]:
try:
    es = ft.read_entityset("groupby_benchmark/")
except Exception:
    print("creating entityset")
    # random integers in [0, 100)
    int_df = pd.DataFrame(np.random.randint(0, 100, size=(num_rows, int_cols))).add_suffix("_int")
    # random floats in [0, 1000) with two decimal places
    float_df = pd.DataFrame(np.random.randint(0, 100000, size=(num_rows, float_cols)) / 100).add_suffix("_float")
    # random group ids stored as strings, drawn from num_groups distinct values
    id_df = pd.DataFrame(np.random.randint(0, num_groups, size=(num_rows, id_columns)), dtype=str).add_suffix("_id")
    df = pd.concat([int_df, float_df, id_df], axis=1)

    # tag the id columns as Id variables so dfs knows to group by them
    variable_types = {}

    for col in id_df.columns:
        variable_types[col] = ft.variable_types.Id

    es = ft.EntitySet()
    es.entity_from_dataframe(entity_id="entity",
                             dataframe=df,
                             index="index",
                             variable_types=variable_types,
                             make_index=True)
    # cache the entityset so later runs can skip regeneration
    es.to_csv("groupby_benchmark", compression="gzip")

es
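
As a quick sanity check, the four `*_id` columns should now be registered as `Id` variables. A minimal sketch, assuming the featuretools 0.x API used above, where an entity exposes its typed columns via `.variables`:

In [ ]:
# sketch: list the variables that were typed as Id
# (Entity.variables is the 0.x-era API assumed here)
[v for v in es["entity"].variables if isinstance(v, ft.variable_types.Id)]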

In [ ]:
fl = ft.dfs(target_entity="entity",
            entityset=es,
            groupby_trans_primitives=["cum_sum", "cum_max", "cum_min"],
            max_depth=1,
            features_only=True,
            verbose=True)

len(fl)
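
With `features_only=True`, `dfs` returns the feature definitions without computing anything, so `len(fl)` simply counts how many groupby transform features were generated. A quick way to eyeball a few of them, assuming the standard `get_name()` method on feature objects:

In [ ]:
# sketch: peek at the first few generated feature names,
# e.g. "CUM_SUM(0_int) by 0_id"
[f.get_name() for f in fl[:5]]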

In [ ]:
fm = ft.calculate_feature_matrix(entityset=es,
                                 features=fl,
                                 chunk_size=num_rows,
                                 verbose=True)
fm.head()
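
Since this notebook is a benchmark, it can help to isolate the wall-clock cost of the calculation itself. A sketch using IPython's `%time` magic; with `chunk_size=num_rows` the whole matrix is computed in a single chunk, so this times one pass over the data:

In [ ]:
# sketch: re-run the calculation just to time it (duplicates the work above)
%time _ = ft.calculate_feature_matrix(entityset=es, features=fl, chunk_size=num_rows)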

In [ ]:
TEST_NAME = "by_features_2.csv"
fm.to_csv(TEST_NAME)

In [ ]:
# check that the new feature matrix matches the saved baseline in master.csv
truth = pd.read_csv("master.csv")
test = pd.read_csv(TEST_NAME)[truth.columns]

truth.equals(test)
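
`DataFrame.equals` requires exact equality, which can fail when floats round-trip through CSV. A looser sketch using pandas' testing utilities; note that `assert_frame_equal` raises an AssertionError on mismatch instead of returning False:

In [ ]:
# sketch: tolerance-aware comparison, in case CSV round-tripping
# introduced tiny floating-point differences
pd.testing.assert_frame_equal(truth, test, check_exact=False)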