In [8]:
import pandas as pd 
import numpy as np
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import matplotlib.gridspec as gridspec
import scipy as sp

%matplotlib inline

This notebook describes an analysis of US domestic flight data. We will first take a dive into a single year, 2008, and do some basic analysis. Then we may expand this to multiple years to look for trends. Finally, we will use a single year to build a predictive model that predicts whether or not a flight will be delayed.


In [12]:
df_flights_2008 = pd.read_csv('/home/hakim/Documents/Airline_delays-/flight_data_historical/2008.csv')
print(df_flights_2008.tail())


         Year  Month  DayofMonth  DayOfWeek  DepTime  CRSDepTime  ArrTime  \
7009723  2008     12          13          6   1002.0         959   1204.0   
7009724  2008     12          13          6    834.0         835   1021.0   
7009725  2008     12          13          6    655.0         700    856.0   
7009726  2008     12          13          6   1251.0        1240   1446.0   
7009727  2008     12          13          6   1110.0        1103   1413.0   

         CRSArrTime UniqueCarrier  FlightNum        ...         TaxiIn  \
7009723        1150            DL       1636        ...            6.0   
7009724        1023            DL       1637        ...            5.0   
7009725         856            DL       1638        ...           24.0   
7009726        1437            DL       1639        ...           13.0   
7009727        1418            DL       1641        ...            8.0   

         TaxiOut  Cancelled  CancellationCode  Diverted  CarrierDelay  \
7009723     45.0          0               NaN         0           NaN   
7009724     23.0          0               NaN         0           NaN   
7009725     12.0          0               NaN         0           NaN   
7009726     13.0          0               NaN         0           NaN   
7009727     11.0          0               NaN         0           NaN   

        WeatherDelay NASDelay  SecurityDelay  LateAircraftDelay  
7009723          NaN      NaN            NaN                NaN  
7009724          NaN      NaN            NaN                NaN  
7009725          NaN      NaN            NaN                NaN  
7009726          NaN      NaN            NaN                NaN  
7009727          NaN      NaN            NaN                NaN  

[5 rows x 29 columns]

In [13]:
df_flights_2008.shape


Out[13]:
(7009728, 29)

In [14]:
df_flights_2008.isnull().sum(axis=0)


Out[14]:
Year                       0
Month                      0
DayofMonth                 0
DayOfWeek                  0
DepTime               136246
CRSDepTime                 0
ArrTime               151649
CRSArrTime                 0
UniqueCarrier              0
FlightNum                  0
TailNum                83365
ActualElapsedTime     154699
CRSElapsedTime           844
AirTime               154699
ArrDelay              154699
DepDelay              136246
Origin                     0
Dest                       0
Distance                   0
TaxiIn                151649
TaxiOut               137058
Cancelled                  0
CancellationCode     6872294
Diverted                   0
CarrierDelay         5484993
WeatherDelay         5484993
NASDelay             5484993
SecurityDelay        5484993
LateAircraftDelay    5484993
dtype: int64

The next thing we would like to do is remove cancelled flights. Although they can be interesting for other analyses, they are not needed for ours. As a side note, only about 2% of flights were cancelled.
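A quick check of that figure, using the Cancelled flag directly (a minimal sketch):

In [ ]:
# share of 2008 flights marked as cancelled (roughly 2%)
print(df_flights_2008['Cancelled'].mean() * 100)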


In [15]:
df = df_flights_2008
df = df[df.Cancelled != 1]

We will also remove diverted flights. For this analysis we would like to only focus on delays.


In [16]:
df = df[df.Diverted != 1]
df.isnull().sum(axis=0)


Out[16]:
Year                       0
Month                      0
DayofMonth                 0
DayOfWeek                  0
DepTime                    0
CRSDepTime                 0
ArrTime                    0
CRSArrTime                 0
UniqueCarrier              0
FlightNum                  0
TailNum                    5
ActualElapsedTime          0
CRSElapsedTime             0
AirTime                    0
ArrDelay                   0
DepDelay                   0
Origin                     0
Dest                       0
Distance                   0
TaxiIn                     0
TaxiOut                    0
Cancelled                  0
CancellationCode     6855029
Diverted                   0
CarrierDelay         5330294
WeatherDelay         5330294
NASDelay             5330294
SecurityDelay        5330294
LateAircraftDelay    5330294
dtype: int64

What we have left are only flights that were neither cancelled nor diverted, so we can remove the columns related to cancellations and diversions. Also, as we can see from the null counts above, roughly 78% of flights were on time (the delay-cause columns are only populated for delayed flights). That is not great.
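A quick way to verify that share from the null counts above (the delay-cause columns appear to be populated only for flights arriving 15 or more minutes late), a minimal sketch:

In [ ]:
# fraction of remaining flights with no recorded delay cause, i.e. roughly the on-time share
print(df['CarrierDelay'].isnull().mean() * 100)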


In [17]:
df = df.drop(['Cancelled', 'Diverted', 'CancellationCode'], axis = 1)

If a flight departs or arrives more than 10 minutes late, we mark it with a boolean variable below.


In [43]:
conditionsArr = [(df['ArrDelay'] > 10)]
choicesArr = [1]
df['isDelArr'] = np.select(conditionsArr, choicesArr, default = 0)

conditionsDep = [(df['DepDelay'] > 10)]
choicesDep = [1]
df['isDelDep'] = np.select(conditionsDep, choicesDep, default = 0)
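
As a side note, the same flags can be built more concisely with a boolean cast; an equivalent sketch:

In [ ]:
# equivalent construction: a boolean comparison cast to 0/1
df['isDelArr'] = (df['ArrDelay'] > 10).astype(int)
df['isDelDep'] = (df['DepDelay'] > 10).astype(int)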

In [58]:
ts = df[['isDelArr', 'isDelDep']].sum().to_frame()
ts.columns = ['Count']

sns.barplot(x = ts.index , y = ts.Count);


Now we will focus only on the top 10 origin airports by flight count. They most likely generate the most revenue.


In [59]:
dforig = pd.value_counts(df.Origin, sort=True).to_frame().reset_index()
dforig.columns = ['Origin','Count']


topdforig = dforig.head(n=10)
topdforig.set_index('Origin',inplace = True)
df = df[df['Origin'].isin(topdforig.index)]


grouped = df.groupby('Origin').sum()
grouped = grouped[['isDelArr','isDelDep']]
# delay ratios per origin: delayed flights divided by total flights at that origin
grouped['DelArrRa'] = grouped.isDelArr / topdforig.Count
grouped['DelDepRa'] = grouped.isDelDep / topdforig.Count


gs = gridspec.GridSpec(3, 1)
ax1 = plt.subplot(gs[0])
ax2 = plt.subplot(gs[1])
ax3 = plt.subplot(gs[2])

sns.barplot(x = grouped.index, y = grouped.DelArrRa, ax = ax1);
sns.barplot(x = grouped.index, y = grouped.DelDepRa, ax = ax2);
sns.barplot(x = topdforig.index, y = topdforig.Count, ax = ax3);



In [21]:
pd.options.display.float_format = '{:.2f}'.format
df[['ActualElapsedTime', 'ArrDelay', 'DepDelay', 'Distance']].describe()


Out[21]:
ActualElapsedTime ArrDelay DepDelay Distance
count 2311651.00 2311651.00 2311651.00 2311651.00
mean 129.85 9.05 10.97 769.84
std 67.42 36.35 33.15 560.81
min 15.00 -519.00 -534.00 24.00
25% 80.00 -9.00 -4.00 352.00
50% 114.00 -1.00 0.00 612.00
75% 160.00 13.00 10.00 997.00
max 837.00 1395.00 1403.00 4502.00

In [13]:
gs = gridspec.GridSpec(2, 2)
ax1 = plt.subplot(gs[0])
ax2 = plt.subplot(gs[1])
ax3 = plt.subplot(gs[2])
ax4 = plt.subplot(gs[3])
sns.distplot(df.ActualElapsedTime, ax = ax1);
sns.distplot(df.ArrDelay, ax = ax2);
sns.distplot(df.DepDelay, ax = ax3);
sns.distplot(df.Distance, ax = ax4);



In [14]:
dforig = pd.value_counts(df.Origin, sort=True).to_frame().reset_index()
dforig.columns = ['Origin','Count']

topdforig = dforig.head(n=10)
topdforig.set_index('Origin',inplace = True)
df = df[df['Origin'].isin(topdforig.index)]


dfa = pd.value_counts(df.UniqueCarrier, sort=True).to_frame().reset_index().head(n=10)
dfa.columns = ['Carrier','Count']
dfa.set_index('Carrier', inplace = True)
df = df[df['UniqueCarrier'].isin(dfa.index)]

delay_type_sum =  df[['UniqueCarrier','CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']].groupby(df.UniqueCarrier).sum().dropna()

flights = df.groupby('UniqueCarrier').count()
flights = flights[flights.index.isin(delay_type_sum.index)]
flights['Number_Flights'] = flights.Year
flights = flights[[ 'Number_Flights']]
flights.dropna(inplace = True)

delay_per_flight = delay_type_sum[['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']].div(flights.Number_Flights, axis='index')

delay_per_flight.describe()


Out[14]:
CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay
count 10.00 10.00 10.00 10.00 10.00
mean 4.20 0.79 3.56 0.02 4.33
std 1.40 0.59 1.40 0.01 1.81
min 2.12 0.16 1.42 0.00 2.14
25% 3.33 0.46 2.88 0.00 3.20
50% 4.06 0.58 3.48 0.01 4.20
75% 4.79 1.02 4.64 0.02 5.21
max 7.12 2.17 5.73 0.05 8.23

In [15]:
gs = gridspec.GridSpec(2, 3)
ax1 = plt.subplot(gs[0])
ax2 = plt.subplot(gs[1])
ax3 = plt.subplot(gs[2])
ax4 = plt.subplot(gs[3])
ax5 = plt.subplot(gs[4])

sns.pointplot(x= delay_per_flight.index, y='CarrierDelay', data=delay_per_flight, ax = ax1)
sns.pointplot(x= delay_per_flight.index, y='WeatherDelay', data=delay_per_flight, ax = ax2)
sns.pointplot(x= delay_per_flight.index, y='NASDelay', data=delay_per_flight, ax = ax3)
sns.pointplot(x= delay_per_flight.index, y='SecurityDelay', data=delay_per_flight, ax = ax4);
sns.pointplot(x= delay_per_flight.index, y='LateAircraftDelay', data=delay_per_flight, ax = ax5);



In [61]:
dfWN = df[df['UniqueCarrier']=='WN']
delayed_per_month = dfWN[['isDelArr', 'Month']].groupby('Month').sum()
delayed_ratio_pm = delayed_per_month / dfWN[['isDelArr', 'Month']].groupby('Month').count()

sns.barplot(x = delayed_per_month.index, y = delayed_ratio_pm.isDelArr);



In [64]:
delayed_per_day = dfWN[['isDelArr', 'DayOfWeek']].groupby('DayOfWeek').sum()
delayed_ratio_pd = delayed_per_day / dfWN[['isDelArr', 'DayOfWeek']].groupby('DayOfWeek').count()
sns.barplot(x = delayed_per_day.index, y = delayed_ratio_pd.isDelArr);
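
Since isDelArr is a 0/1 flag, the same ratios can be obtained directly with a groupby mean; an equivalent sketch:

In [ ]:
# the mean of a 0/1 flag within each group is the delayed fraction
dfWN.groupby('Month')['isDelArr'].mean()
dfWN.groupby('DayOfWeek')['isDelArr'].mean()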


Now we will start building a predictive model to predict whether a flight will be delayed. We will start with a simple logistic regression. If it is not good enough, we will move on to more complicated algorithms such as SVM.


In [184]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import svm
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import cross_validation
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

In [182]:
#defining our target variable
y = dfWN.isDelArr
X = dfWN[['Month','DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime', 'TailNum', 'ActualElapsedTime', 'Origin', 'Dest', 'Distance']]

for col in ['Month','DayofMonth', 'DayOfWeek']:
    X[col] = X[col].astype('category')


X_dum = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(X_dum, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)


/home/hakim/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
(197604, 670) (197604,)
(49401, 670) (49401,)
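
As an aside, the SettingWithCopyWarning above comes from assigning into a slice of dfWN; taking an explicit copy of the feature frame avoids it (a minimal sketch):

In [ ]:
# copying the slice makes X an independent frame, so the astype assignment raises no warning
X = dfWN[['Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime', 'TailNum',
          'ActualElapsedTime', 'Origin', 'Dest', 'Distance']].copy()
for col in ['Month', 'DayofMonth', 'DayOfWeek']:
    X[col] = X[col].astype('category')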

In [68]:
model = LogisticRegression()
model = model.fit(X_train, y_train)

# check the accuracy on the training set
print(model.score(X_train, y_train),
model.score(X_test, y_test))


0.795823971175 0.793485961823

The training and test accuracies are very similar, which means the model most likely suffers from high bias (it is underfitting). And since roughly 70% of flights are not delayed, this accuracy is only modestly better than the baseline of always predicting "not delayed". That is not very good. We need to experiment a bit; we will try some transformations next.
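To make that baseline comparison explicit, a quick sketch using the test split and fitted model from above:

In [ ]:
# accuracy of always predicting 'not delayed' (the majority class)
print(1 - y_test.mean())
# confusion matrix of the logistic regression on the test set
print(metrics.confusion_matrix(y_test, model.predict(X_test)))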


In [26]:
# log-transform the continuous features (scheduled times, elapsed time, distance)
X_dum[['CRSDepTime_2', 'CRSArrTime_2', 'ActualElapsedTime_2', 'Distance_2']] = np.log(X_dum[['CRSDepTime', 'CRSArrTime', 'ActualElapsedTime', 'Distance']])

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X_dum, y, test_size=0.3)

#model = LogisticRegression()
#model = model.fit(X_train, y_train)

# check the accuracy on the training set
#print(model.score(X_train, y_train),
#model.score(X_test, y_test))

In [ ]:
clf = svm.SVC(kernel='linear', C=1, random_state=0)
clf = clf.fit(X_train.head(n=10000), y_train.head(n=10000))
clf.score(X_test, y_test)

The transformations did not lead to better results, so we will move on to more complicated models such as SVM and a random forest.
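Before moving on, k-fold cross-validation gives a less noisy estimate than a single train/test split; a minimal sketch on a subsample of the data (subsample size is an arbitrary choice to keep the runtime reasonable):

In [ ]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the logistic regression on a 50,000-row subsample
scores = cross_val_score(LogisticRegression(), X_dum.head(50000), y.head(50000), cv=5)
print(scores.mean(), scores.std())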


In [69]:
clf = RandomForestClassifier(n_jobs=2, random_state=0,  max_features = None, oob_score=True)
clf = clf.fit(X_train, y_train)
print(clf.oob_score_,clf.score(X_test, y_test))


/home/hakim/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/home/hakim/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
0.803784336349 0.819295155969
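
The OOB warning above suggests the default of 10 trees is too few for reliable out-of-bag estimates; adding trees is an easy, if slower, fix (a sketch):

In [ ]:
# more trees give a stable OOB estimate at the cost of longer training time
clf = RandomForestClassifier(n_estimators=100, n_jobs=2, random_state=0,
                             max_features=None, oob_score=True)
clf = clf.fit(X_train, y_train)
print(clf.oob_score_, clf.score(X_test, y_test))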

In [192]:
clf = MLPClassifier(activation='tanh', alpha=1e-05, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False,
       epsilon=1e-08, hidden_layer_sizes=(100, 4), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)
clf = clf.fit(X_train, y_train)
clf.score(X_test,y_test)


Out[192]:
0.76688730997348231

In [146]:
import sqlite3

conn = sqlite3.connect("/home/hakim/Documents/Airline_delays-/flight_data_historical/delayed.sqlite3")
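
For context, the delayed table queried below is assumed to hold the yearly CSVs concatenated together; a minimal, hypothetical sketch of how it could have been built (file paths and chunk size are assumptions):

In [ ]:
# hypothetical: append each yearly CSV to the 'delayed' table in manageable chunks
import glob
for path in sorted(glob.glob('/home/hakim/Documents/Airline_delays-/flight_data_historical/*.csv')):
    for chunk in pd.read_csv(path, chunksize=500000):
        chunk.to_sql('delayed', conn, if_exists='append', index=False)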

In [120]:
flightcount = pd.read_sql_query("select distinct Year ,count(Origin) as number_of_flights from delayed where Cancelled != 1 group by Year;", conn)
flightcount.set_index('Year', inplace = True)

In [124]:
fc = sns.pointplot(x = flightcount.index, y = flightcount.number_of_flights)
fc.set_xticklabels(flightcount.index, rotation=90);



In [128]:
flightscancelled = pd.read_sql_query("select distinct Year ,sum(Cancelled) as canceled_flights from delayed group by Year;", conn)
flightscancelled.set_index('Year', inplace = True)

In [ ]:
delflcount = pd.read_sql_query("select distinct Year ,count(Origin) as number_of_flights from delayed where Cancelled != 1 group by Year;", conn)

In [134]:
fcl = sns.pointplot(x = flightscancelled.index, y = (flightscancelled.canceled_flights /flightcount.number_of_flights) * 100)
fcl.set_xticklabels(flightscancelled.index, rotation=90);



In [136]:
flightsdiverted = pd.read_sql_query("select distinct Year ,sum(Diverted) as Diverted from delayed group by Year;", conn)
flightsdiverted.set_index('Year', inplace = True)

In [140]:
dd = sns.pointplot(x = flightsdiverted.index, y = flightsdiverted.Diverted /flightcount.number_of_flights)
dd.set_xticklabels(flightsdiverted.index, rotation=90);



In [153]:
delayarr = pd.read_sql_query("select distinct year,sum(ArrDelay) as ArrDelay from delayed where ArrDelay >= 30 group by year;", conn)
delayarr.set_index('Year', inplace = True)

In [157]:
# total arrival-delay minutes (flights delayed 30+ minutes) per non-cancelled flight, by year
delayarr.ArrDelay / flightcount.number_of_flights

Out[157]:
Year
1987    6.32
1988    5.12
1989    6.50
1990    5.84
1991    4.50
1992    4.26
1993    4.84
1994    5.13
1995    6.38
1996    8.49
1997    7.02
1998    8.15
1999    8.84
2000   10.81
2001    7.70
2002    6.16
2003    6.56
2004    8.65
2005    9.09
2006   10.47
2007   11.81
2008   10.78
dtype: float64

In [154]:
da = sns.pointplot(x = delayarr.index, y = delayarr.ArrDelay /flightcount.number_of_flights)
da.set_xticklabels(delayarr.index, rotation=90);



In [102]:
delaytrendweather = pd.read_sql_query("select distinct year,sum(WeatherDelay) as weatherdelay from delayed group by year;", conn)
delaytrendweather.set_index('Year', inplace = True)

In [145]:
dw = sns.pointplot(x = delaytrendweather.index, y = delaytrendweather.weatherdelay)
dw.set_xticklabels(delaytrendweather.index, rotation=90);



In [133]:
delaytrendsecurity = pd.read_sql_query("select distinct Year, sum(SecurityDelay) as SecurityDelay from delayed group by Year;", conn)
delaytrendsecurity.set_index('Year', inplace = True)

In [144]:
ds = sns.pointplot(x = delaytrendsecurity.index, y = delaytrendsecurity.SecurityDelay)
ds.set_xticklabels(delaytrendsecurity.index, rotation=90);



In [141]:
delaytrendlateflight = pd.read_sql_query("select distinct Year, sum(LateAircraftDelay) as LateAircraftDelay from delayed group by Year;", conn)
delaytrendlateflight.set_index('Year', inplace = True)

In [148]:
dl = sns.pointplot(x = delaytrendlateflight.index, y = delaytrendlateflight.LateAircraftDelay)
dl.set_xticklabels(delaytrendlateflight.index, rotation=90);



In [179]:
# flights per origin per year; keep the busiest origin airport for each year
topar = pd.read_sql_query("select Year, Origin, count(*) as count_origin from delayed group by Year, Origin order by Year, count_origin desc;", conn)
topar = topar.groupby('Year').first()

In [ ]: