Import Statement


In [ ]:
import dask.dataframe as dd

Basics


In [ ]:
%%time
ddf = dd.read_csv('sample_data.csv')
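
read_csv also accepts glob patterns and a blocksize. A hedged sketch, assuming a hypothetical data/ directory of CSV shards:


In [ ]:
# hypothetical: load many CSV files as one lazy DataFrame;
# blocksize controls how the input is split into partitions
ddf_many = dd.read_csv('data/*.csv', blocksize='64MB')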

Bonus: Parquet


In [ ]:
%%time
# same data, read from Parquet (typically faster to load than CSV)
ddf_pq = dd.read_parquet('sample_data_parquet')
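
If you need to produce such a Parquet dataset yourself, to_parquet writes one file per partition. A minimal sketch, assuming the CSV from above and a Parquet engine such as pyarrow installed:


In [ ]:
# hypothetical one-off conversion from the CSV source to Parquet
dd.read_csv('sample_data.csv').to_parquet('sample_data_parquet')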

In [ ]:
ddf.head()

In [ ]:
type(ddf)

In [ ]:
ddf

In [ ]:
ddf.npartitions
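
Each partition is an ordinary pandas DataFrame. To see how rows are spread across partitions, or to change the split, something like the following works (the target of 8 partitions is an arbitrary illustration):


In [ ]:
# number of rows in each partition (len runs on every pandas partition)
ddf.map_partitions(len).compute()

In [ ]:
# produce a coarser or finer split without changing the data
ddf8 = ddf.repartition(npartitions=8)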

Computation


In [ ]:
%%time
# lazy: this only builds the task graph; nothing is computed yet
ddf['mean'] = ddf.mean(axis=1)
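
Because the assignment above is lazy, the %%time there measures graph construction rather than the actual computation. The pending graph can be inspected before running it; a sketch, assuming the optional graphviz dependency is installed:


In [ ]:
# render the task graph to a file (requires graphviz)
ddf['mean'].visualize(filename='mean_graph.svg')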

Best Practices

In general, a full data shuffle is expensive. Avoid repeated calls to set_index, and keep sorting operations within partitions whenever possible.
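
For example, a sort that only needs to hold within each partition can go through map_partitions instead of a global set_index. A minimal sketch, assuming a column named 'A':


In [ ]:
# sorts each pandas partition independently; no shuffle is triggered
ddf_sorted = ddf.map_partitions(lambda df: df.sort_values('A'))

The cell below applies the same per-partition idea column by column, and passes meta so Dask knows the output schema without having to infer it: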


In [ ]:
def computation_within_partitions(df, in_col):
    # runs on each pandas partition: per-partition standard deviation
    df[f'{in_col}_STD'] = df[in_col].std()
    return df

# start from the existing schema and extend it with the new columns
out_meta = ddf.dtypes.to_dict()

for col in ddf.columns.tolist():
    # only the original single-letter columns (skips derived ones like 'mean')
    if len(col) == 1:
        out_meta[f'{col}_STD'] = float
        ddf = ddf.map_partitions(
            computation_within_partitions,
            in_col=col,
            meta=out_meta
        )

In [ ]:
%%time
# materializes the whole result as a single in-memory pandas DataFrame
df = ddf.compute()

In [ ]:
type(df)
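
When the result is too large for a single pandas DataFrame, or will feed several downstream computations, persist() is often a better fit than compute(): it materializes the partitions in memory while keeping the Dask API. A hedged sketch:


In [ ]:
# keep the computed partitions in memory as a Dask DataFrame
ddf = ddf.persist()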

In [ ]:
%%time
# selecting only the needed columns prunes the task graph, so less work runs
mini_df = ddf[['A','A_STD']].compute()