In [ ]:
import pandas as pd
import numpy as np
import featuretools as ft
%load_ext autoreload
%autoreload 2
# benchmark dimensions
num_rows = 10000    # rows in the synthetic dataframe
int_cols = 10       # integer feature columns
float_cols = 10     # float feature columns
id_columns = 4      # id columns to group by
num_groups = 1000   # distinct group values per id column
In [ ]:
try:
    es = ft.read_entityset("groupby_benchmark/")
except Exception:
    print("creating entityset")
    int_df = pd.DataFrame(np.random.randint(0, 100, size=(num_rows, int_cols))).add_suffix("_int")
    float_df = pd.DataFrame(np.random.randint(0, 100000, size=(num_rows, float_cols)) / 100).add_suffix("_float")
    id_df = pd.DataFrame(np.random.randint(0, num_groups, size=(num_rows, id_columns)), dtype=str).add_suffix("_id")
    df = pd.concat([int_df, float_df, id_df], axis=1)

    # register the *_id columns as Id variables so the groupby primitives can use them
    variable_types = {}
    for col in id_df.columns:
        variable_types[col] = ft.variable_types.Id

    es = ft.EntitySet()
    es.entity_from_dataframe(entity_id="entity",
                             dataframe=df,
                             index="index",
                             variable_types=variable_types,
                             make_index=True)
    es.to_csv("groupby_benchmark", compression="gzip")
es
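As a quick sanity check (a minimal sketch, assuming the pre-1.0 featuretools API used above), the entity's variable list can be inspected to confirm the *_id columns were registered as Id variables:
In [ ]:
# list the entity's variables; the *_id columns should show up as Id
es["entity"].variables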
In [ ]:
# build feature definitions only (features_only=True); the matrix is calculated below
fl = ft.dfs(target_entity="entity",
            entityset=es,
            groupby_trans_primitives=["cum_sum", "cum_max", "cum_min"],
            max_depth=1,
            features_only=True,
            verbose=True)
len(fl)
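To see what DFS generated, the first few feature names can be printed; a small inspection sketch (get_name() is the standard accessor on featuretools feature objects):
In [ ]:
# peek at a few of the generated groupby transform features
[f.get_name() for f in fl[:5]]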
In [ ]:
fm = ft.calculate_feature_matrix(entityset=es,
                                 features=fl,
                                 chunk_size=num_rows,
                                 verbose=True)
fm.head()
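Since this is a groupby benchmark, it can help to time the calculation directly rather than rely on the verbose output; a sketch using the standard-library timeit module (the choice of 3 repeats is an assumption, not from the original benchmark):
In [ ]:
import timeit
# best-of-3 wall-clock time for the full feature matrix calculation
runs = timeit.repeat(
    lambda: ft.calculate_feature_matrix(entityset=es,
                                        features=fl,
                                        chunk_size=num_rows),
    repeat=3, number=1)
min(runs)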
In [ ]:
TEST_NAME = "by_features_2.csv"
fm.to_csv(TEST_NAME)
In [ ]:
# check that this run matches the results saved from master
truth = pd.read_csv("master.csv")
test = pd.read_csv(TEST_NAME)[truth.columns]  # align column order with master
truth.equals(test)
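Note that DataFrame.equals requires exact equality, so a CSV round-trip of the float columns can make it return False even when the values are effectively identical. A tolerant comparison (a sketch using pandas' testing helper, not part of the original check) may be more robust:
In [ ]:
# raises an AssertionError with details if the frames differ beyond tolerance
pd.testing.assert_frame_equal(truth, test, check_exact=False)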