Import Statement


In [ ]:
import dask.dataframe as dd

Basics


In [ ]:
%%time
ddf = dd.read_csv('sample_data.csv')
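
read_csv also accepts glob patterns and a blocksize. A hedged sketch, assuming a hypothetical data/ directory of CSV shards:


In [ ]:
# hypothetical: load many CSV files as one lazy DataFrame;
# blocksize controls how the input is split into partitions
ddf_many = dd.read_csv('data/*.csv', blocksize='64MB')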

Bonus: Parquet


In [ ]:
%%time
# same data, read from Parquet (typically faster to load than CSV)
ddf_pq = dd.read_parquet('sample_data_parquet')
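
If you need to produce such a Parquet dataset yourself, to_parquet writes one file per partition. A minimal sketch, assuming the CSV from above and a Parquet engine such as pyarrow installed:


In [ ]:
# hypothetical one-off conversion from the CSV source to Parquet
dd.read_csv('sample_data.csv').to_parquet('sample_data_parquet')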

In [ ]:
ddf.head()

In [ ]:
type(ddf)

In [ ]:
ddf

In [ ]:
ddf.npartitions
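
Each partition is an ordinary pandas DataFrame. To see how rows are spread across partitions, or to change the split, something like the following works (the target of 8 partitions is an arbitrary illustration):


In [ ]:
# number of rows in each partition (len runs on every pandas partition)
ddf.map_partitions(len).compute()

In [ ]:
# produce a coarser or finer split without changing the data
ddf8 = ddf.repartition(npartitions=8)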

Computation


In [ ]:
%%time
# lazy: this only builds the task graph; nothing is computed yet
ddf['mean'] = ddf.mean(axis=1)
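
Because the assignment above is lazy, the %%time there measures graph construction rather than the actual computation. The pending graph can be inspected before running it; a sketch, assuming the optional graphviz dependency is installed:


In [ ]:
# render the task graph to a file (requires graphviz)
ddf['mean'].visualize(filename='mean_graph.svg')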

Best Practices

In general, a full data shuffle is expensive. Avoid repeated calls to set_index, and keep sorting operations within partitions whenever possible.
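
For example, a sort that only needs to hold within each partition can go through map_partitions instead of a global set_index. A minimal sketch, assuming a column named 'A':


In [ ]:
# sorts each pandas partition independently; no shuffle is triggered
ddf_sorted = ddf.map_partitions(lambda df: df.sort_values('A'))

The cell below applies the same per-partition idea column by column, and passes meta so Dask knows the output schema without having to infer it: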


In [ ]:
def computation_within_partitions(df, in_col):
    # runs on each pandas partition: per-partition standard deviation
    df[f'{in_col}_STD'] = df[in_col].std()
    return df

# start from the existing schema and extend it with the new columns
out_meta = ddf.dtypes.to_dict()

for col in ddf.columns.tolist():
    # only the original single-letter columns (skips derived ones like 'mean')
    if len(col) == 1:
        out_meta[f'{col}_STD'] = float
        ddf = ddf.map_partitions(
            computation_within_partitions,
            in_col=col,
            meta=out_meta
        )

In [ ]:
%%time
# materializes the whole result as a single in-memory pandas DataFrame
df = ddf.compute()

In [ ]:
type(df)
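
When the result is too large for a single pandas DataFrame, or will feed several downstream computations, persist() is often a better fit than compute(): it materializes the partitions in memory while keeping the Dask API. A hedged sketch:


In [ ]:
# keep the computed partitions in memory as a Dask DataFrame
ddf = ddf.persist()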

In [ ]:
%%time
# selecting only the needed columns prunes the task graph, so less work runs
mini_df = ddf[['A','A_STD']].compute()