This part looks at what a dask DataFrame actually is — a lazily evaluated dependency graph — and how these graphs get executed
In [1]:
# Complete set of Python 3.6 imports used for these examples
# Standard modules
import io
import logging
import lzma
import multiprocessing
import os
import ssl
import sys
import time
import urllib.request
import zipfile
# Third-party modules
import fastparquet # Needs python-snappy and llvmlite
import graphviz # To visualize Dask graphs
import numpy as np
import pandas as pd
import psutil # Memory stats
import dask
import dask.dataframe as dd
import bokeh.io # For Dask profile graphs
import seaborn as sns # For colormaps
# Support multiple lines of output in each cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Don't wrap tables
pd.options.display.max_rows = 20
pd.options.display.max_columns = 20
pd.options.display.width = 300
# Show matplotlib and bokeh graphs inline in Jupyter notebook
%matplotlib inline
bokeh.io.output_notebook()
# Record interpreter and library versions for reproducibility
print(sys.version)
np.__version__, pd.__version__, dask.__version__
Out[1]:
In [97]:
# Build a small 5x3 demo frame to experiment with throughout the notebook
df = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]],
    columns=['a', 'b', 'c'],
)
print(df)
In [131]:
# Wrap the pandas frame in a dask DataFrame with a single partition
# and inspect its internals
ddf = dd.from_pandas(df, npartitions=1)
print(ddf)
# Index values at the partition boundaries
ddf.divisions
# _meta: an empty pandas DataFrame carrying just the columns and dtypes
print(ddf._meta)
# _name: the unique key prefix for this collection's tasks in the graph
ddf._name
# The underlying task graph (a dict-like mapping of keys to computations)
ddf.dask
# Render the task graph with graphviz
ddf.visualize()
Out[131]:
Out[131]:
Out[131]:
Out[131]:
In [133]:
# Same frame, but split across two partitions
ddf = dd.from_pandas(df, npartitions=2)
print(ddf)
# With two partitions there are three boundary index values
ddf.divisions
print(ddf._meta)
ddf._name
# The graph now holds one from_pandas task per partition
ddf.dask
ddf.visualize()
Out[133]:
Out[133]:
Out[133]:
Out[133]:
In [144]:
# head() normally executes immediately; compute=False keeps it lazy, and
# npartitions=2 asks it to consider both partitions
ddf = dd.from_pandas(df, npartitions=2).head(n=2, npartitions=2, compute=False)
print(ddf)
# Divisions of the head() result
ddf.divisions
print(ddf._meta)
# head() adds its own layer (and key prefix) on top of from_pandas
ddf._name
ddf.dask
ddf.visualize()
Out[144]:
Out[144]:
Out[144]:
Out[144]:
In [146]:
# The (name, partition-index) task keys compute() would request
# NOTE(review): _keys() is a private API — may change between dask versions
ddf._keys()
Out[146]:
In [139]:
# Execute the graph and materialize the result as a pandas object
ddf.compute()
Out[139]:
In [123]:
# BUG FIX: the original read `ddf1.head(1,compute=False)`, but `ddf1` is never
# defined anywhere in this notebook (a stale name from a since-deleted cell,
# which would fail on Restart & Run All). Use the existing `ddf` instead.
ddf2 = ddf.head(1, compute=False)
ddf2.visualize()
# New key prefix for the extra head() layer
ddf2._name
# Graph now contains the upstream tasks plus the new head task
ddf2.dask
Out[123]:
Out[123]:
Out[123]:
In [140]:
# Output keys of ddf2's final graph layer (private API)
ddf2._keys()
Out[140]:
In [141]:
# IPython: `??` shows the source of the private _keys helper
ddf2._keys??
In [94]:
# Partition by a target row count instead of a partition count
# (chunksize=3 over 5 rows — see npartitions in the next cell)
ddf = dd.from_pandas(df, chunksize=3)
print(ddf)
In [96]:
# Inspect the rebuilt collection: dtypes, partition count, boundaries, graph
print(ddf._meta)
ddf.npartitions
ddf.divisions
ddf.visualize()
Out[96]:
Out[96]:
Out[96]:
In [98]:
# The raw task graph of the chunked collection
ddf.dask
Out[98]:
In [99]:
# Pretty-print the graph: each task key, then its task tuple indented below
for key, value in ddf.dask.items():
    print(repr(key))
    print(' ' + repr(value))
In [103]:
# Lazy elementwise add, then column sums — builds a graph, computes nothing yet
(ddf+1).sum()
Out[103]:
In [110]:
# compute=False keeps head() lazy; by default it reads only the first partition
task = ddf.head(n=2, compute=False)
task.dask
task.visualize()
Out[110]:
Out[110]:
In [108]:
# npartitions=-1 lets head() consider every partition, not just the first
ddf.head(2, npartitions=-1, compute=False).visualize()
Out[108]:
In [102]:
# BUG FIX: the original read `(ddf + 1).sum()visualize()` — a SyntaxError;
# the method-call dot before visualize() was missing.
(ddf + 1).sum().visualize()
Out[102]:
In [ ]:
In [ ]:
In [ ]:
In [87]:
# For comparison: pandas head() executes eagerly
df.head(n=2)
Out[87]:
In [143]:
# Lazy head over the first two partitions
task = ddf.head(n=2, npartitions=2, compute=False)
In [91]:
# Render the head() task graph
task.visualize()
Out[91]:
In [92]:
# The task-graph mapping behind the lazy head()
task.dask
Out[92]:
In [142]:
# Output keys of the final graph layer (private API)
task._keys()
Out[142]:
In [ ]:
In [ ]:
In [ ]:
In [65]:
# The pandas DataFrame docstring, for comparison with dask's below
print(pd.DataFrame.__doc__)
In [66]:
# The dask DataFrame docstring
print(dd.DataFrame.__doc__)
In [ ]:
# BUG FIX: the original cell called dd.from_pandas() with no arguments, which
# raises TypeError on a fresh run. The intent was evidently a documentation
# lookup, so print the docstring instead (matching the __doc__ cells above).
print(dd.from_pandas.__doc__)
In [ ]:
In [52]:
# chunksize=2 over 5 rows, then a lazy boolean-mask row filter
ddf = dd.from_pandas(df, chunksize=2)
task = ddf[ddf.a>2]
In [53]:
# Run the filter and collect the matching rows into a pandas DataFrame
task.compute()
Out[53]:
In [54]:
# Render the filter's task graph
task.visualize()
Out[54]:
In [55]:
# NOTE(review): duplicate of an earlier cell — dd.DataFrame.__doc__ was
# already printed above; consider deleting this cell
print(dd.DataFrame.__doc__)
In [56]:
# The filter preserves the column/dtype metadata
task._meta
Out[56]:
In [57]:
# Partition count and index boundaries of the filtered collection
task.npartitions
task.divisions
Out[57]:
Out[57]:
In [58]:
# Key prefix of the filter layer in the graph
task._name
Out[58]:
In [59]:
# Full task graph of the filtered collection
task.dask
Out[59]:
In [60]:
# Look up the concrete task tuple for partition 0 of the filter layer
task.dask[(task._name,0)]
Out[60]:
In [61]:
# Same lookup for partition 1
task.dask[(task._name,1)]
Out[61]:
In [62]:
# IPython: show the source of compute()
task.compute??
In [48]:
# BUG FIX: `task2` is never defined in this notebook (stale name from a
# since-deleted cell; fails on Restart & Run All). Use the `task` filter
# defined above.
task.compute()
Out[48]:
In [49]:
# BUG FIX: `task2` is never defined in this notebook; visualize the existing
# `task` instead.
task.visualize()
Out[49]:
In [51]:
# BUG FIX: `task2` is never defined in this notebook; inspect partition 0 of
# the existing `task`'s final layer instead.
task.dask[(task._name, 0)]
Out[51]:
In [ ]: