In [1]:
import pandas as pd
# Load the 2015 airline delay-causes CSV from the local data directory.
df = pd.read_csv(filepath_or_buffer='data/airline_delay_causes_2015.csv')
# Show the first five rows as this cell's output.
df.head(n=5)
Out[1]:
In [2]:
# Remove leading/trailing whitespace from every column label.
clean_columns = df.columns.str.strip()
df.columns = clean_columns
dimple only accepts months in a two-digit format, so zero-pad the month values.
In [3]:
# Zero-pad months to two characters and keep the whole column as strings.
# Mapping only the one-digit months would leave 10-12 as integers and produce
# a mixed str/int column; astype(str) + zfill is uniform and safe to re-run.
df['month'] = df['month'].astype(str).str.zfill(2)
In [4]:
# List the distinct month labels to check the formatting step.
df['month'].unique()
Out[4]:
We want the total number of operations and the total minutes of delay, so we aggregate the data per month.
In [10]:
# Sum every numeric measure per month. numeric_only=True keeps text columns
# (such as carrier_name) out of the aggregation instead of concatenating them
# or raising, depending on the pandas version.
agg_month_sum = df.groupby('month', as_index=False).sum(numeric_only=True)

# On-time flights = all arriving flights minus the not-on-time categories.
not_ontime_flights = ['arr_cancelled', 'arr_diverted', 'arr_del15']
agg_month_sum['on_time_flights'] = (
    agg_month_sum['arr_flights'] - agg_month_sum[not_ontime_flights].sum(axis=1)
)

# Divide every *_delay column by 60 with a vectorized operation
# (presumably minutes -> hours; confirm the source units).
delayed_columns = agg_month_sum.columns[agg_month_sum.columns.str.endswith('_delay')]
agg_month_sum[delayed_columns] = agg_month_sum[delayed_columns] / 60

# Export the monthly aggregate without the index column.
agg_month_sum.to_csv('agg_month_sum_airlines_2015.csv_', index=False)
In [10]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
In [16]:
# Plot June's per-carrier totals: one line per carrier, summed metrics on the x-axis.
# The month formatting step stores months as zero-padded strings, so comparing
# against the integer 6 selects no rows; normalise to the padded form instead.
is_june = df['month'].astype(str).str.zfill(2) == '06'
# numeric_only=True keeps text columns out of the sum so the transposed frame is plottable.
df[is_june].groupby('carrier_name').sum(numeric_only=True).T.plot()
Out[16]:
In [7]:
# Per-row ratio of carrier_delay to carrier_ct, stored as a new column on df.
df['delay_minutes_per_delayed_flight'] = df['carrier_delay'] / df['carrier_ct']

# Mean of that ratio for every (carrier_name, month) pair, with the keys kept as columns.
date_df = (
    df.groupby(['carrier_name', 'month'], as_index=False)
      ['delay_minutes_per_delayed_flight']
      .mean()
)
In [9]:
# Write the per-carrier, per-month averages to disk without the index column.
date_df.to_csv(path_or_buf='carr_delay_2015.csv_', index=False)
In [ ]:
In [40]:
# Render matplotlib figures inline in the notebook output.
# NOTE(review): this magic already appears in an earlier cell; this repeat looks redundant.
%matplotlib inline
In [ ]:
# Read the exported per-carrier delay file back in and preview it.
# Grouping df by this file name raises KeyError because it is not a column.
# NOTE(review): assumed intent is to inspect the export; confirm or delete this cell.
pd.read_csv('carr_delay_2015.csv_').head()
In [44]:
# NOTE(review): unexplained literals. Presumably an ad-hoc ratio check
# (e.g. a delay total divided by a flight count); confirm their source
# or derive them from df so the cell is reproducible.
1206011 / 19579
Out[44]: