In [ ]:
from dask import dataframe as dd
In [ ]:
%%time
# Lazily wrap the CSV in a Dask DataFrame. Only a small sample is read for
# dtype inference; %%time therefore measures graph/metadata setup, not a
# full load.
ddf = dd.read_csv('sample_data.csv')
In [ ]:
%%time
# Same data stored as Parquet, timed for comparison against the CSV read.
# NOTE(review): bound to `df`, not `ddf` — presumably intentional (timing
# comparison only), but this `df` is overwritten by `ddf.compute()` below.
df = dd.read_parquet('sample_data_parquet')
In [ ]:
# Preview the first rows — head() triggers real computation, but only on
# the leading partition(s).
ddf.head()
In [ ]:
# Confirm the container type (a dask.dataframe DataFrame, not pandas).
type(ddf)
In [ ]:
# The repr shows columns/dtypes and partition structure but no values —
# the collection is still lazy at this point.
ddf
In [ ]:
# How many partitions the CSV was split into.
ddf.npartitions
In [ ]:
%%time
# Add a row-wise mean across columns as a new (lazy) 'mean' column.
# NOTE(review): assumes all existing columns are numeric — confirm, since a
# non-numeric column would break or be excluded from mean(axis=1).
ddf['mean'] = ddf.mean(axis=1)
In [ ]:
def computation_within_partitions(df, in_col):
    """Add a per-partition summary column ``{in_col}_STD`` to *df*.

    Intended to run inside each partition via ``map_partitions``, so the
    statistic is computed over that partition's rows only and broadcast
    down the new column.

    Parameters
    ----------
    df : pandas.DataFrame
        One partition of the Dask DataFrame (mutated in place).
    in_col : str
        Name of the column to summarize.

    Returns
    -------
    pandas.DataFrame
        The same partition with the added ``{in_col}_STD`` column.
    """
    # BUG FIX: the column is named *_STD but the original stored .mean();
    # compute the standard deviation (pandas default ddof=1) so the value
    # matches the name. The column name itself must stay unchanged — the
    # notebook later selects 'A_STD' by name.
    df[f'{in_col}_STD'] = df[in_col].std()
    return df
# Build the output schema incrementally: start from the current dtypes and,
# for every single-letter data column, register a float "<col>_STD" column
# and map the per-partition helper over the collection.
# NOTE(review): indentation was lost in this notebook export; reconstructed
# with map_partitions inside the `if`, the only nesting under which the
# incrementally-grown `out_meta` matches each step's actual output schema.
out_meta = ddf.dtypes.to_dict()
for col in ddf.columns.tolist():
    # Skip derived columns such as 'mean' — only one-letter names qualify.
    if len(col) != 1:
        continue
    out_meta.update({f"{col}_STD": float})
    # `meta` tells dask the resulting schema without executing the function.
    ddf = ddf.map_partitions(
        computation_within_partitions,
        in_col=col,
        meta=out_meta,
    )
In [ ]:
%%time
# Execute the whole lazy task graph and materialize the result as a single
# in-memory pandas DataFrame.
df = ddf.compute()
In [ ]:
# Now a plain pandas.DataFrame.
type(df)
In [ ]:
%%time
# Column pruning: selecting just two columns before compute() lets dask
# process only what is needed — typically faster than the full compute above.
mini_df = ddf[['A','A_STD']].compute()
In [ ]: