In [1]:
import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar, Profiler, ResourceProfiler, visualize
import bokeh.plotting as bp
In [2]:
# Register a global progress bar so every dask .compute() in this notebook
# shows its progress; route bokeh output (used by visualize()) inline.
ProgressBar().register()
bp.output_notebook()
In [3]:
# Column names for the airline on-time performance CSVs, in file order.
cols = [
    'year', 'month', 'day_of_month', 'day_of_week',
    'deptime', 'crs_deptime', 'arrtime', 'crs_arrtime',
    'unique_carrier', 'flight_num', 'tail_num',
    'actual_elapsed_time', 'crs_elapsed_time', 'air_time',
    'arrdelay', 'depdelay', 'origin', 'dest', 'distance',
    'taxi_in', 'taxi_out', 'cancelled', 'cancellation_code', 'diverted',
    'carrier_delay', 'weather_delay', 'nas_delay',
    'security_delay', 'late_aircraft_delay',
]

# Explicit dtype overrides so dask parses every partition consistently.
# Columns that may contain missing values are forced to float/object.
_float_cols = [
    'taxi_in', 'taxi_out', 'carrier_delay', 'weather_delay', 'nas_delay',
    'security_delay', 'late_aircraft_delay', 'crs_deptime', 'crs_arrtime',
    'flight_num', 'crs_elapsed_time', 'distance',
]
dtypes = {name: float for name in _float_cols}
dtypes.update({
    'cancellation_code': object,
    'tail_num': object,
    'cancelled': bool,
    'diverted': bool,
})
In [4]:
# Read the flight CSVs lazily from the external SSD: the 19*.csv files
# total about 6.3 GB, too large to load eagerly with plain pandas.
DATA_GLOB = '/Volumes/Samsung_T5/data/flights/19*.csv'  # TODO: make configurable instead of hardcoding a local path

df = dd.read_csv(DATA_GLOB,
                 header=0,      # each file has a header row; skip it
                 names=cols,    # apply our normalized column names
                 dtype=dtypes)  # explicit dtypes avoid per-partition inference mismatches
In [5]:
# Sanity check: first two rows of the combined frame.
df.head(2)
Out[5]:
In [6]:
# And the last two rows.
df.tail(2)
Out[6]:
In [7]:
# The dask DataFrame is partitioned into chunks along the index; the
# npartitions attribute reports how many partitions were created.
df.npartitions
Out[7]:
In [8]:
# Count non-null values per column while recording task-level and
# resource-level profiles, then render both profiles with bokeh.
with Profiler() as task_prof, ResourceProfiler() as resource_prof:
    flight_counts = df.count().compute()
    print(flight_counts)
visualize([task_prof, resource_prof])
Out[8]:
In [9]:
# Top 10 origin airports by mean departure delay, profiled as above.
# df.groupby('origin')['depdelay'] is the idiomatic dask form and gives the
# same result as the series-by-series df.depdelay.groupby(df.origin).
with Profiler() as prof, ResourceProfiler() as rprof:
    top_avg_depdelay = (
        df.groupby('origin')['depdelay']
          .mean()
          .nlargest(10)
          .compute()
    )
    print(top_avg_depdelay)
visualize([prof, rprof])
Out[9]:
In [ ]: