In [1]:
pip show pandas dask


Name: pandas
Version: 1.0.1
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: None
Author-email: None
License: BSD
Location: /Users/NothingToLose/clepy/myvenv/lib/python3.8/site-packages
Requires: numpy, python-dateutil, pytz
Required-by: 
---
Name: dask
Version: 2.12.0
Summary: Parallel PyData with Task Scheduling
Home-page: https://github.com/dask/dask/
Author: None
Author-email: None
License: BSD
Location: /Users/NothingToLose/clepy/myvenv/lib/python3.8/site-packages
Requires: 
Required-by: 
Note: you may need to restart the kernel to use updated packages.

In [2]:
import numpy as np
import pandas as pd
from dask import dataframe as dd

In [3]:
df = pd.DataFrame(
    np.random.randint(0,100,size=(9000000,26)),
    columns=list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
)
ddf = dd.from_pandas(df, 10)

In [5]:
ddf['useless_letter'] = ddf['A'].apply(lambda x: np.random.choice(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')))
ddf['alphabet'] = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
out_columns = ddf.columns.tolist()
out_columns.reverse()
ddf = ddf[out_columns]


/Users/NothingToLose/clepy/myvenv/lib/python3.8/site-packages/dask/dataframe/core.py:3073: UserWarning: 
You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('A', 'object'))

  warnings.warn(meta_warning(meta))

In [6]:
ddf.head()


Out[6]:
alphabet useless_letter Z Y X W V U T S ... J I H G F E D C B A
0 ABCDEFGHIJKLMNOPQRSTUVWXYZ W 1 66 69 4 70 64 83 55 ... 36 28 33 57 23 86 48 30 91 84
1 ABCDEFGHIJKLMNOPQRSTUVWXYZ J 25 26 36 34 0 75 60 73 ... 42 91 8 24 64 13 43 47 94 11
2 ABCDEFGHIJKLMNOPQRSTUVWXYZ D 40 67 36 54 46 5 57 50 ... 82 76 24 60 3 55 64 28 26 89
3 ABCDEFGHIJKLMNOPQRSTUVWXYZ D 91 98 8 36 17 3 29 90 ... 88 24 7 51 52 87 1 6 19 48
4 ABCDEFGHIJKLMNOPQRSTUVWXYZ H 87 68 73 78 39 67 57 24 ... 9 14 0 2 51 18 95 71 28 13

5 rows × 28 columns


In [8]:
%%time
ddf.to_csv('sample_data.csv', single_file=True)


Out[8]:
['/Users/NothingToLose/clepy/Dask/sample_data.csv']

In [9]:
%%time
ddf.to_parquet('sample_data_parquet')

In [4]:


In [ ]:


In [ ]: