notebook.community

Edit and run



In [1]:

    
pip show pandas dask









    



Name: pandas
Version: 1.0.1
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: None
Author-email: None
License: BSD
Location: /Users/NothingToLose/clepy/myvenv/lib/python3.8/site-packages
Requires: numpy, python-dateutil, pytz
Required-by: 
---
Name: dask
Version: 2.12.0
Summary: Parallel PyData with Task Scheduling
Home-page: https://github.com/dask/dask/
Author: None
Author-email: None
License: BSD
Location: /Users/NothingToLose/clepy/myvenv/lib/python3.8/site-packages
Requires: 
Required-by: 
Note: you may need to restart the kernel to use updated packages.



In [2]:

    
import numpy as np
import pandas as pd
from dask import dataframe as dd



In [3]:

    
df = pd.DataFrame(
    np.random.randint(0,100,size=(9000000,26)),
    columns=list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
)
ddf = dd.from_pandas(df, 10)



In [5]:

    
ddf['useless_letter'] = ddf['A'].apply(lambda x: np.random.choice(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')))
ddf['alphabet'] = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
out_columns = ddf.columns.tolist()
out_columns.reverse()
ddf = ddf[out_columns]









    



/Users/NothingToLose/clepy/myvenv/lib/python3.8/site-packages/dask/dataframe/core.py:3073: UserWarning: 
You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('A', 'object'))

  warnings.warn(meta_warning(meta))



In [6]:

    
ddf.head()









    Out[6]:







  
    
      
      alphabet
      useless_letter
      Z
      Y
      X
      W
      V
      U
      T
      S
      ...
      J
      I
      H
      G
      F
      E
      D
      C
      B
      A
    
  
  
    
      0
      ABCDEFGHIJKLMNOPQRSTUVWXYZ
      W
      1
      66
      69
      4
      70
      64
      83
      55
      ...
      36
      28
      33
      57
      23
      86
      48
      30
      91
      84
    
    
      1
      ABCDEFGHIJKLMNOPQRSTUVWXYZ
      J
      25
      26
      36
      34
      0
      75
      60
      73
      ...
      42
      91
      8
      24
      64
      13
      43
      47
      94
      11
    
    
      2
      ABCDEFGHIJKLMNOPQRSTUVWXYZ
      D
      40
      67
      36
      54
      46
      5
      57
      50
      ...
      82
      76
      24
      60
      3
      55
      64
      28
      26
      89
    
    
      3
      ABCDEFGHIJKLMNOPQRSTUVWXYZ
      D
      91
      98
      8
      36
      17
      3
      29
      90
      ...
      88
      24
      7
      51
      52
      87
      1
      6
      19
      48
    
    
      4
      ABCDEFGHIJKLMNOPQRSTUVWXYZ
      H
      87
      68
      73
      78
      39
      67
      57
      24
      ...
      9
      14
      0
      2
      51
      18
      95
      71
      28
      13
    
  

5 rows × 28 columns



In [8]:

    
%%time
ddf.to_csv('sample_data.csv', single_file=True)









    Out[8]:





['/Users/NothingToLose/clepy/Dask/sample_data.csv']



In [9]:

    
%%time
ddf.to_parquet('sample_data_parquet')



In [4]:



In [ ]:



In [ ]:

	alphabet	useless_letter	Z	Y	X	W	V	U	T	S	...	J	I	H	G	F	E	D	C	B	A
0	ABCDEFGHIJKLMNOPQRSTUVWXYZ	W	1	66	69	4	70	64	83	55	...	36	28	33	57	23	86	48	30	91	84
1	ABCDEFGHIJKLMNOPQRSTUVWXYZ	J	25	26	36	34	0	75	60	73	...	42	91	8	24	64	13	43	47	94	11
2	ABCDEFGHIJKLMNOPQRSTUVWXYZ	D	40	67	36	54	46	5	57	50	...	82	76	24	60	3	55	64	28	26	89
3	ABCDEFGHIJKLMNOPQRSTUVWXYZ	D	91	98	8	36	17	3	29	90	...	88	24	7	51	52	87	1	6	19	48
4	ABCDEFGHIJKLMNOPQRSTUVWXYZ	H	87	68	73	78	39	67	57	24	...	9	14	0	2	51	18	95	71	28	13