In [1]:
import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar, Profiler, ResourceProfiler, visualize
import bokeh.plotting as bp

In [2]:
ProgressBar().register()
bp.output_notebook()


Loading BokehJS ...

In [3]:
cols = ['year', 'month', 'day_of_month', 'day_of_week', 'deptime', 'crs_deptime', 'arrtime', 
        'crs_arrtime', 'unique_carrier', 'flight_num', 'tail_num', 'actual_elapsed_time',
        'crs_elapsed_time', 'air_time', 'arrdelay', 'depdelay', 'origin', 'dest', 'distance', 
        'taxi_in', 'taxi_out', 'cancelled', 'cancellation_code', 'diverted', 'carrier_delay',
        'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']

dtypes = {'cancellation_code': object, 'taxi_in': float, 'taxi_out': float, 'cancelled': bool,
          'diverted': bool, 'carrier_delay': float, 'weather_delay': float, 'nas_delay': float,
          'security_delay': float, 'late_aircraft_delay': float, 'tail_num': object,
          'crs_deptime': float, 'crs_arrtime': float, 'flight_num': float, 'crs_elapsed_time': float,
          'distance': float}

In [4]:
#reads file from external SSD because of 19*.csv files account for 6,3GB
df = dd.read_csv('/Volumes/Samsung_T5/data/flights/19*.csv', 
                 header=0, 
                 names=cols, 
                 dtype=dtypes)

In [5]:
df.head(2)


[########################################] | 100% Completed |  2.0s
Out[5]:
year month day_of_month day_of_week deptime crs_deptime arrtime crs_arrtime unique_carrier flight_num ... taxi_in taxi_out cancelled cancellation_code diverted carrier_delay weather_delay nas_delay security_delay late_aircraft_delay
0 1987 10 14 3 741.0 730.0 912.0 849.0 PS 1451.0 ... NaN NaN False NaN False NaN NaN NaN NaN NaN
1 1987 10 15 4 729.0 730.0 903.0 849.0 PS 1451.0 ... NaN NaN False NaN False NaN NaN NaN NaN NaN

2 rows × 29 columns


In [6]:
df.tail(2)


[########################################] | 100% Completed |  1.3s
Out[6]:
year month day_of_month day_of_week deptime crs_deptime arrtime crs_arrtime unique_carrier flight_num ... taxi_in taxi_out cancelled cancellation_code diverted carrier_delay weather_delay nas_delay security_delay late_aircraft_delay
404693 1999 12 19 7 1907.0 1910.0 2114.0 2117.0 AA 1359.0 ... 11.0 13.0 False NaN False NaN NaN NaN NaN NaN
404694 1999 12 20 1 1905.0 1910.0 2127.0 2117.0 AA 1359.0 ... 15.0 8.0 False NaN False NaN NaN NaN NaN NaN

2 rows × 29 columns


In [7]:
#The dask DataFrame is partitioned into chunks along the index. To see how many partitions, you can use the npartitions attribute.
df.npartitions


Out[7]:
103

In [8]:
with Profiler() as prof, ResourceProfiler() as rprof:
    count_flights = df.count().compute()
print(count_flights)
visualize([prof, rprof])


[########################################] | 100% Completed |  1min 49.1s
year                   64249512
month                  64249512
day_of_month           64249512
day_of_week            64249512
deptime                63213091
crs_deptime            64249512
arrtime                63054229
crs_arrtime            64249512
unique_carrier         64249512
flight_num             64249512
tail_num               27003866
actual_elapsed_time    63054229
crs_elapsed_time       64225390
air_time               26375480
arrdelay               63054229
depdelay               63213091
origin                 64249512
dest                   64249512
distance               64047512
taxi_in                27003866
taxi_out               27003866
cancelled              64249512
cancellation_code             0
diverted               64249512
carrier_delay                 0
weather_delay                 0
nas_delay                     0
security_delay                0
late_aircraft_delay           0
dtype: int64
Out[8]:
Column(
id = 'bde2cc4d-f864-4127-b902-9ba67d07f664', …)

In [9]:
with Profiler() as prof, ResourceProfiler() as rprof:
    top_avg_depdelay = df.depdelay.groupby(df.origin).mean().nlargest(10).compute()
print(top_avg_depdelay)
visualize([prof, rprof])


[########################################] | 100% Completed |  1min 47.4s
origin
PIR    85.000000
ABI    36.500000
BFI    28.000000
ACV    19.740223
YAP    19.099857
ACK    18.723881
RDD    15.724234
FCA    14.939219
SUN    14.538462
DUT    13.630278
Name: depdelay, dtype: float64
Out[9]:
Column(
id = '3e300424-a7ca-4808-9549-22f2fe468d3e', …)

In [ ]: