notebook.community

Edit and run



In [1]:

    
import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar, Profiler, ResourceProfiler, visualize
import bokeh.plotting as bp



In [2]:

    
ProgressBar().register()
bp.output_notebook()









    





    
        
        Loading BokehJS ...



In [3]:

    
cols = ['year', 'month', 'day_of_month', 'day_of_week', 'deptime', 'crs_deptime', 'arrtime', 
        'crs_arrtime', 'unique_carrier', 'flight_num', 'tail_num', 'actual_elapsed_time',
        'crs_elapsed_time', 'air_time', 'arrdelay', 'depdelay', 'origin', 'dest', 'distance', 
        'taxi_in', 'taxi_out', 'cancelled', 'cancellation_code', 'diverted', 'carrier_delay',
        'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']

dtypes = {'cancellation_code': object, 'taxi_in': float, 'taxi_out': float, 'cancelled': bool,
          'diverted': bool, 'carrier_delay': float, 'weather_delay': float, 'nas_delay': float,
          'security_delay': float, 'late_aircraft_delay': float, 'tail_num': object,
          'crs_deptime': float, 'crs_arrtime': float, 'flight_num': float, 'crs_elapsed_time': float,
          'distance': float}



In [4]:

    
#reads file from external SSD because of 19*.csv files account for 6,3GB
df = dd.read_csv('/Volumes/Samsung_T5/data/flights/19*.csv', 
                 header=0, 
                 names=cols, 
                 dtype=dtypes)



In [5]:

    
df.head(2)









    



[########################################] | 100% Completed |  2.0s






    Out[5]:







  
    
      
      year
      month
      day_of_month
      day_of_week
      deptime
      crs_deptime
      arrtime
      crs_arrtime
      unique_carrier
      flight_num
      ...
      taxi_in
      taxi_out
      cancelled
      cancellation_code
      diverted
      carrier_delay
      weather_delay
      nas_delay
      security_delay
      late_aircraft_delay
    
  
  
    
      0
      1987
      10
      14
      3
      741.0
      730.0
      912.0
      849.0
      PS
      1451.0
      ...
      NaN
      NaN
      False
      NaN
      False
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      1
      1987
      10
      15
      4
      729.0
      730.0
      903.0
      849.0
      PS
      1451.0
      ...
      NaN
      NaN
      False
      NaN
      False
      NaN
      NaN
      NaN
      NaN
      NaN
    
  

2 rows × 29 columns



In [6]:

    
df.tail(2)









    



[########################################] | 100% Completed |  1.3s






    Out[6]:







  
    
      
      year
      month
      day_of_month
      day_of_week
      deptime
      crs_deptime
      arrtime
      crs_arrtime
      unique_carrier
      flight_num
      ...
      taxi_in
      taxi_out
      cancelled
      cancellation_code
      diverted
      carrier_delay
      weather_delay
      nas_delay
      security_delay
      late_aircraft_delay
    
  
  
    
      404693
      1999
      12
      19
      7
      1907.0
      1910.0
      2114.0
      2117.0
      AA
      1359.0
      ...
      11.0
      13.0
      False
      NaN
      False
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      404694
      1999
      12
      20
      1
      1905.0
      1910.0
      2127.0
      2117.0
      AA
      1359.0
      ...
      15.0
      8.0
      False
      NaN
      False
      NaN
      NaN
      NaN
      NaN
      NaN
    
  

2 rows × 29 columns



In [7]:

    
#The dask DataFrame is partitioned into chunks along the index. To see how many partitions, you can use the npartitions attribute.
df.npartitions









    Out[7]:





103



In [8]:

    
with Profiler() as prof, ResourceProfiler() as rprof:
    count_flights = df.count().compute()
print(count_flights)
visualize([prof, rprof])









    



[########################################] | 100% Completed |  1min 49.1s
year                   64249512
month                  64249512
day_of_month           64249512
day_of_week            64249512
deptime                63213091
crs_deptime            64249512
arrtime                63054229
crs_arrtime            64249512
unique_carrier         64249512
flight_num             64249512
tail_num               27003866
actual_elapsed_time    63054229
crs_elapsed_time       64225390
air_time               26375480
arrdelay               63054229
depdelay               63213091
origin                 64249512
dest                   64249512
distance               64047512
taxi_in                27003866
taxi_out               27003866
cancelled              64249512
cancellation_code             0
diverted               64249512
carrier_delay                 0
weather_delay                 0
nas_delay                     0
security_delay                0
late_aircraft_delay           0
dtype: int64






    






    







    














    Out[8]:




Column(
id = 'bde2cc4d-f864-4127-b902-9ba67d07f664', …)
children = [ToolbarBox(id='a40bb820-7a17-4750-ba9a-16e521ec6fd7', ...), Column(id='5f028db0-ceda-4de7-a08b-dbfdb0a61a35', ...)],
css_classes = [],
disabled = False,
height = None,
js_event_callbacks = {},
js_property_callbacks = {},
name = None,
sizing_mode = 'fixed',
subscribed_events = [],
tags = [],
width = None)



In [9]:

    
with Profiler() as prof, ResourceProfiler() as rprof:
    top_avg_depdelay = df.depdelay.groupby(df.origin).mean().nlargest(10).compute()
print(top_avg_depdelay)
visualize([prof, rprof])









    



[########################################] | 100% Completed |  1min 47.4s
origin
PIR    85.000000
ABI    36.500000
BFI    28.000000
ACV    19.740223
YAP    19.099857
ACK    18.723881
RDD    15.724234
FCA    14.939219
SUN    14.538462
DUT    13.630278
Name: depdelay, dtype: float64






    






    







    














    Out[9]:




Column(
id = '3e300424-a7ca-4808-9549-22f2fe468d3e', …)
children = [ToolbarBox(id='96160ecd-3cac-401f-884d-4522ab0b1d34', ...), Column(id='19a7c95d-f6d9-4244-bb12-6d054b3a95b2', ...)],
css_classes = [],
disabled = False,
height = None,
js_event_callbacks = {},
js_property_callbacks = {},
name = None,
sizing_mode = 'fixed',
subscribed_events = [],
tags = [],
width = None)



In [ ]:

	year	month	day_of_month	day_of_week	deptime	crs_deptime	arrtime	crs_arrtime	unique_carrier	flight_num	...	taxi_in	taxi_out	cancelled	cancellation_code	diverted	carrier_delay	weather_delay	nas_delay	security_delay	late_aircraft_delay
0	1987	10	14	3	741.0	730.0	912.0	849.0	PS	1451.0	...	NaN	NaN	False	NaN	False	NaN	NaN	NaN	NaN	NaN
1	1987	10	15	4	729.0	730.0	903.0	849.0	PS	1451.0	...	NaN	NaN	False	NaN	False	NaN	NaN	NaN	NaN	NaN

	year	month	day_of_month	day_of_week	deptime	crs_deptime	arrtime	crs_arrtime	unique_carrier	flight_num	...	taxi_in	taxi_out	cancelled	cancellation_code	diverted	carrier_delay	weather_delay	nas_delay	security_delay	late_aircraft_delay
404693	1999	12	19	7	1907.0	1910.0	2114.0	2117.0	AA	1359.0	...	11.0	13.0	False	NaN	False	NaN	NaN	NaN	NaN	NaN
404694	1999	12	20	1	1905.0	1910.0	2127.0	2117.0	AA	1359.0	...	15.0	8.0	False	NaN	False	NaN	NaN	NaN	NaN	NaN