This part looks at what dask DataFrames actually are (lazily evaluated dependency graphs) and how they get executed.
In [1]:
    
# Complete set of Python 3.6 imports used for these examples
# Standard modules
import io
import logging
import lzma
import multiprocessing
import os
import ssl
import sys
import time
import urllib.request
import zipfile
# Third-party modules
import fastparquet      # Needs python-snappy and llvmlite
import graphviz         # To visualize Dask graphs 
import numpy as np
import pandas as pd
import psutil           # Memory stats
import dask
import dask.dataframe as dd
import bokeh.io         # For Dask profile graphs
import seaborn as sns   # For colormaps
# Support multiple lines of output in each cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Don't wrap tables
pd.options.display.max_rows = 20
pd.options.display.max_columns = 20
pd.options.display.width = 300
# Show matplotlib and bokeh graphs inline in Jupyter notebook
%matplotlib inline
bokeh.io.output_notebook()
print(sys.version)
np.__version__, pd.__version__, dask.__version__
    
    
    
    
    Out[1]:
In [97]:
    
# Small 5x3 integer frame that the cells below pass to dd.from_pandas
df = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
        [10, 11, 12],
        [13, 14, 15],
    ],
    columns=list('abc'),
)
print(df)
    
    
In [131]:
    
# Wrap the pandas frame in a dask DataFrame, requesting a single partition
ddf = dd.from_pandas(df, npartitions=1)
print(ddf)
# Inspect the collection: divisions, private metadata (_meta), graph name (_name)
# and the graph itself. Each bare expression is echoed because the setup cell
# sets ast_node_interactivity to "all".
ddf.divisions
print(ddf._meta)
ddf._name
ddf.dask
# Draw the task graph (graphviz is imported in the setup cell for this)
ddf.visualize()
    
    
    Out[131]:
    
    Out[131]:
    Out[131]:
    Out[131]:
In [133]:
    
# Same inspection as the single-partition cell, now requesting two partitions
ddf = dd.from_pandas(df, npartitions=2)
print(ddf)
# Divisions, private metadata (_meta), graph name (_name) and the graph itself;
# each bare expression is echoed because ast_node_interactivity is "all"
ddf.divisions
print(ddf._meta)
ddf._name
ddf.dask
# Draw the task graph
ddf.visualize()
    
    
    Out[133]:
    
    Out[133]:
    Out[133]:
    Out[133]:
In [144]:
    
# Build the two-partition frame and chain a head() onto it. compute=False keeps
# the result as a dask object, so the name `ddf` now refers to the head graph
# rather than the full frame.
ddf = dd.from_pandas(df, npartitions=2).head(n=2, npartitions=2, compute=False)
print(ddf)
# Divisions, private metadata (_meta), graph name (_name) and the graph itself;
# each bare expression is echoed because ast_node_interactivity is "all"
ddf.divisions
print(ddf._meta)
ddf._name
ddf.dask
# Draw the task graph for the head() operation
ddf.visualize()
    
    
    Out[144]:
    
    Out[144]:
    Out[144]:
    Out[144]:
In [146]:
    
# Show the keys dask reports for this collection.
# NOTE(review): `_keys` is underscore-prefixed (private) and may be renamed or
# absent in other dask versions — confirm against the version printed in the setup cell.
ddf._keys()
    
    Out[146]:
In [139]:
    
# Execute the lazy graph held by `ddf` and display the concrete result
ddf.compute()
    
    Out[139]:
In [123]:
    
# Take a lazy one-row head and inspect the resulting graph.
# `ddf` (assigned in the preceding cells) is used as the source frame so that
# this cell runs on a fresh kernel; no `ddf1` is ever assigned in the notebook.
ddf2 = ddf.head(1, compute=False)
# Draw the graph, then echo its name and the graph mapping itself
ddf2.visualize()
ddf2._name
ddf2.dask
    
    Out[123]:
    Out[123]:
    Out[123]:
In [140]:
    
# Show the keys dask reports for the one-row head collection.
# NOTE(review): `_keys` is a private method — confirm it exists in the dask version in use.
ddf2._keys()
    
    Out[140]:
In [141]:
    
# IPython introspection: `??` displays the method's source instead of calling it.
# NOTE(review): development aid — consider removing from the finished notebook.
ddf2._keys??
    
In [94]:
    
# Rebuild the dask frame, this time specifying chunksize=3 instead of a partition count
ddf = dd.from_pandas(df, chunksize=3)
print(ddf)
    
    
In [96]:
    
# Inspect the current `ddf`: private metadata, partition count, divisions,
# and finally the rendered task graph
print(ddf._meta)
ddf.npartitions
ddf.divisions
ddf.visualize()
    
    
    Out[96]:
    Out[96]:
    Out[96]:
In [98]:
    
# Echo the task graph object backing `ddf`
ddf.dask
    
    Out[98]:
In [99]:
    
# Dump the raw graph: each key on its own line, its value indented beneath it
for graph_key, graph_value in ddf.dask.items():
    print(f'{graph_key!r}')
    print(f'  {graph_value!r}')
    
    
In [103]:
    
# Chain an elementwise +1 with a sum; no compute() is called, so the cell
# echoes the lazy dask result object rather than numbers
(ddf+1).sum()
    
    Out[103]:
In [110]:
    
# Lazy two-row head using head()'s default npartitions; compute=False keeps it as a graph
task = ddf.head(n=2, compute=False)
# Echo the graph mapping, then draw it
task.dask
task.visualize()
    
    Out[110]:
    Out[110]:
In [108]:
    
# Draw the graph for a lazy two-row head with npartitions=-1
# (per the dask head() docs, -1 means "use all partitions")
ddf.head(2, npartitions=-1, compute=False).visualize()
    
    Out[108]:
In [102]:
    
# Draw the task graph for the lazy "+1 then sum" expression
# (the attribute access needs the '.' before visualize, otherwise the cell is a SyntaxError)
(ddf + 1).sum().visualize()
    
    Out[102]:
In [ ]:
    
    
In [ ]:
    
    
In [ ]:
    
    
In [87]:
    
# pandas baseline for comparison with the dask head() cells: first two rows of `df`
df.head(n=2)
    
    Out[87]:
In [143]:
    
# Lazy two-row head that is allowed to draw from two partitions (npartitions=2);
# compute=False keeps the result as a dask object for the inspection cells that follow
task = ddf.head(n=2, npartitions=2, compute=False)
    
    
In [91]:
    
# Draw the task graph behind `task`
task.visualize()
    
    Out[91]:
In [92]:
    
# Echo the graph mapping behind `task`
task.dask
    
    Out[92]:
In [142]:
    
# Show the keys dask reports for `task`.
# NOTE(review): `_keys` is a private method — confirm it exists in the dask version in use.
task._keys()
    
    Out[142]:
In [ ]:
    
    
In [ ]:
    
    
In [ ]:
    
    
In [65]:
    
# Print the pandas DataFrame class docstring for reference
print(pd.DataFrame.__doc__)
    
    
In [66]:
    
# Print the dask DataFrame class docstring, to compare with the pandas one
print(dd.DataFrame.__doc__)
    
    
In [ ]:
    
# Print the from_pandas docstring for reference, like the DataFrame docstring
# cells above. Calling dd.from_pandas() with no arguments raises TypeError
# because the data argument is required.
print(dd.from_pandas.__doc__)
    
In [ ]:
    
    
In [52]:
    
# Rebuild the dask frame with chunksize=2, then define a lazy row filter
# keeping rows whose 'a' value exceeds 2
ddf = dd.from_pandas(df, chunksize=2)
task = ddf[ddf['a'] > 2]
    
In [53]:
    
# Execute the lazy filter and display the concrete result
task.compute()
    
    Out[53]:
In [54]:
    
# Draw the task graph behind the filter
task.visualize()
    
    Out[54]:
In [55]:
    
# Print the dask DataFrame class docstring (repeats the call made in an earlier cell)
print(dd.DataFrame.__doc__)
    
    
In [56]:
    
# Echo the private metadata object dask keeps for `task`
task._meta
    
    Out[56]:
In [57]:
    
# Partition count and divisions of the filtered collection
# (both are echoed because ast_node_interactivity is "all")
task.npartitions
task.divisions
    
    Out[57]:
    Out[57]:
In [58]:
    
# Private name of this collection; the cells below use it as the first element of graph keys
task._name
    
    Out[58]:
In [59]:
    
# Echo the graph mapping behind the filter
task.dask
    
    Out[59]:
In [60]:
    
# Look up the graph entry keyed by (collection name, 0) — presumably the task for the first partition
task.dask[(task._name,0)]
    
    Out[60]:
In [61]:
    
# Look up the graph entry keyed by (collection name, 1) — presumably the task for the second partition
task.dask[(task._name,1)]
    
    Out[61]:
In [62]:
    
# IPython introspection: `??` displays the source of compute() instead of calling it.
# NOTE(review): development aid — consider removing from the finished notebook.
task.compute??
    
In [48]:
    
# Execute `task2` and display the result.
# NOTE(review): `task2` is not assigned in any cell visible in this notebook; unless it is
# defined elsewhere this raises NameError on a fresh kernel — confirm its intended definition.
task2.compute()
    
    Out[48]:
In [49]:
    
# Draw the task graph behind `task2`.
# NOTE(review): `task2` is not assigned in any cell visible in this notebook — confirm its definition.
task2.visualize()
    
    Out[49]:
In [51]:
    
# Look up the graph entry keyed by (collection name, 0) for `task2`.
# NOTE(review): `task2` is not assigned in any cell visible in this notebook — confirm its definition.
task2.dask[(task2._name,0)]
    
    Out[51]:
In [ ]: