This report will be about Delay metrics for our Airline, SouthWest. We will start out with a "State of the industry". This will involve historical trends in the industry as a whole as well of statistics of the current year. Then we will only focus on the top ten airports. We will then do a deep dive into our own numbers, predicting whether or not our planes will be delayed.
In [1]:
import pandas as pd
import numpy as np
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import matplotlib.gridspec as gridspec
import scipy as sp
import sqlite3
import matplotlib.style as style
style.use('fivethirtyeight')
%matplotlib inline
In [2]:
conn = sqlite3.connect("/home/hakim/Documents/Southwest_Report/flight_data_historical/delayed.sqlite3")
df = pd.read_csv('/home/hakim/Documents/Southwest_Report/flight_data_historical/2008.csv')
conditionsArr = [(df['ArrDelay'] > 15)]
choicesArr = [1]
df['isDelArr'] = np.select(conditionsArr, choicesArr, default = 0)
conditionsDep = [(df['DepDelay'] > 15)]
choicesDep = [1]
df['isDelDep'] = np.select(conditionsDep, choicesDep, default = 0)
flightcount = pd.read_sql_query("select distinct Year ,count(Origin) as number_of_flights from delayed where Cancelled != 1 group by Year;", conn)
flightcount.set_index('Year', inplace = True)
In [3]:
fig=plt.figure(figsize=(10, 5), dpi= 80, facecolor='w', edgecolor='k');
fc = sns.pointplot(x = flightcount.index, y = flightcount.number_of_flights);
fc.set_xticklabels(flightcount.index, rotation=90);
fc.set(xlabel='Year', ylabel='Number of Flights');
fc.set_title('Number of Flights Per Year 1987-2008');
In [4]:
can = pd.read_sql_query("select Year, sum(Cancelled) as Number_Cancelled from delayed group by year;", conn)
can.set_index('Year', inplace = True)
In [5]:
fig=plt.figure(figsize=(10, 5), dpi= 80, facecolor='w', edgecolor='k');
fcan = sns.pointplot(x = can.index, y = can.Number_Cancelled / flightcount.number_of_flights * 100)
fcan.set_xticklabels(can.index, rotation=90);
fcan.set(xlabel='Year', ylabel='Percent of Flights Cancelled');
fcan.set_title('Percent of Flights Cancelled Per Year 1987-2008');
In [6]:
flightsdiverted = pd.read_sql_query("select distinct Year ,sum(Diverted) as Diverted from delayed group by Year;", conn)
flightsdiverted.set_index('Year', inplace = True)
In [7]:
fig=plt.figure(figsize=(10, 5), dpi= 80, facecolor='w', edgecolor='k');
fdiv = sns.barplot(x = flightsdiverted.index, y = flightsdiverted.Diverted / flightcount.number_of_flights * 100);
fdiv.set_xticklabels(flightsdiverted.index, rotation=90);
fdiv.set(xlabel='Year', ylabel='Percent of Flights Diverted');
fdiv.set_title('Percent of Flights Diverted Per Year 1987-2008');
In [8]:
delayarr = pd.read_sql_query("select distinct year,avg(ArrDelay) as ArrDelay from delayed where ArrDelay group by Year;", conn)
delayarr.set_index('Year', inplace = True)
In [9]:
delayarrWN = pd.read_sql_query("select distinct year,avg(ArrDelay) as ArrDelay from delayed where UniqueCarrier='WN' group by Year;", conn)
delayarrWN.set_index('Year', inplace = True)
In [10]:
fig=plt.figure(figsize=(10, 5), dpi= 80, facecolor='w', edgecolor='k');
plt.plot(delayarrWN.index, delayarrWN.ArrDelay)
plt.plot(delayarrWN.index, delayarr.ArrDelay)
plt.legend(['South West', 'Industry Average'], loc='upper left')
plt.xticks(np.arange(min(delayarrWN.index), max(delayarrWN.index)+1, 1.0))
fig.suptitle('Average Minutes Per Flight Arrival Delay Per Year 1987-2008', fontsize=20)
plt.xlabel('Year', fontsize=18)
plt.ylabel('Minutes per Flight', fontsize=16)
plt.show()
In [11]:
delaydep = pd.read_sql_query("select distinct year,avg(DepDelay) as Delay from delayed where DepDelay group by Year;", conn)
delaydep.set_index('Year', inplace = True)
In [12]:
delaydepWN = pd.read_sql_query("select distinct year,avg(DepDelay) as DepDelay from delayed where UniqueCarrier='WN' group by Year;", conn)
delaydepWN.set_index('Year', inplace = True)
In [13]:
fig=plt.figure(figsize=(10, 5), dpi= 80, facecolor='w', edgecolor='k');
plt.plot(delayarrWN.index, delaydepWN.DepDelay)
plt.plot(delayarrWN.index, delaydep.Delay)
plt.legend(['South West', 'Industry Average'], loc='upper right')
plt.xticks(np.arange(min(delayarrWN.index), max(delayarrWN.index)+1, 1.0))
fig.suptitle('Average Minutes Per Flight Departure Delay Per Year 1987-2008', fontsize=20)
plt.xlabel('Year', fontsize=18)
plt.ylabel('Minutes per Flight', fontsize=16)
plt.show()
In [14]:
delaytrendweather = pd.read_sql_query("select distinct year,sum(WeatherDelay) as weatherdelay from delayed group by year;", conn)
delaytrendweather.set_index('Year', inplace = True)
In [15]:
fig=plt.figure(figsize=(10, 5), dpi= 80, facecolor='w', edgecolor='k');
fdw = sns.pointplot(x = delaytrendweather.index, y = delaytrendweather.weatherdelay/60)
fdw.set_xticklabels(delaytrendweather.index, rotation=90)
fdw.set(xlabel='Year', ylabel='Hours')
fdw.set_title('Total Hours of Delay due to Weather Per Year 1987-2008');
In [16]:
delaytrendlateflight= pd.read_sql_query("select Year, sum(CarrierDelay) as delay from delayed group by Year;", conn)
delaytrendlateflight.set_index('Year', inplace=True)
In [17]:
fig=plt.figure(figsize=(10, 5), dpi= 80, facecolor='w', edgecolor='k');
fdlf = sns.pointplot(x = delaytrendlateflight.index, y = delaytrendlateflight.delay/60)
fdlf.set_xticklabels(delaytrendlateflight.index, rotation=90)
fdlf.set(xlabel='Year', ylabel='Hours')
fdlf.set_title('Total Hours of Delay due to Carrier Per Year 1987-2008');
In [18]:
delaytrendsecurity = pd.read_sql_query("select Year, sum(SecurityDelay) as delay from delayed group by Year;", conn)
delaytrendsecurity .set_index('Year', inplace = True)
In [19]:
fig=plt.figure(figsize=(10, 5), dpi= 80, facecolor='w', edgecolor='k');
fds = sns.pointplot(x = delaytrendsecurity.index, y = delaytrendsecurity.delay/60)
fds.set_xticklabels(delaytrendsecurity.index, rotation=90)
fds.set(xlabel='Year', ylabel='Hours')
fds.set_title('Total Hours of Delay due to Security Per Year 1987-2008');
In [20]:
delarr = df[['isDelArr', 'isDelDep']].sum().to_frame()
delarr.columns = ['Count']
In [21]:
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
dela = sns.barplot(x = delarr.index , y = delarr.Count);
dela.set(xlabel='Count', ylabel='Delay Type')
dela.set_title('Number of Delayed Flights');
In [22]:
pd.options.display.float_format = '{:.2f}'.format
fig=plt.figure(figsize=(13, 10), dpi= 80, facecolor='w', edgecolor='k')
gs = gridspec.GridSpec(2, 2)
gs.update(wspace = 0.25, hspace = 0.25)
ax1 = plt.subplot(gs[0])
ax2 = plt.subplot(gs[1])
ax3 = plt.subplot(gs[2])
ax4 = plt.subplot(gs[3])
sns.distplot(df.ActualElapsedTime.dropna(), ax = ax1);
sns.distplot(df.ArrDelay.dropna(), ax = ax2);
sns.distplot(df.DepDelay.dropna(), ax = ax3);
sns.distplot(df.Distance.dropna(), ax = ax4);
In [23]:
dfcar = pd.value_counts(df.UniqueCarrier, sort=True).to_frame().reset_index()
dfcar.columns = ['Carrier','Count']
topdfcar = dfcar.head(n=10)
topdfcar.set_index('Carrier',inplace = True)
In [24]:
fig=plt.figure(figsize=(10, 5), dpi= 80, facecolor='w', edgecolor='k');
topcar = sns.barplot(x = topdfcar.index, y = topdfcar.Count)
topcar.set(xlabel='Carriers', ylabel='Number of Flights')
topcar.set_title('Number of Flights of Top 10 Carriers');
In [25]:
dforig = pd.value_counts(df.Origin, sort=True).to_frame().reset_index()
dforig.columns = ['Origin','Count']
topdforig = dforig.head(n=10)
topdforig.set_index('Origin',inplace = True)
In [26]:
fig=plt.figure(figsize=(10, 5), dpi= 80, facecolor='w', edgecolor='k');
topair = sns.barplot(x = topdforig.index, y = topdforig.Count)
topair.set(xlabel='Airports', ylabel='Number of Flights')
topair.set_title('Number of Flights in Top 10 Airports');
In [27]:
dftop10 = df[df['Origin'].isin(topdforig.index)]
In [28]:
dfcar = pd.value_counts(dftop10.UniqueCarrier, sort=True).to_frame().reset_index()
dfcar.columns = ['Carrier','Count']
topdfcar = dfcar.head(n=10)
topdfcar.set_index('Carrier',inplace = True)
In [29]:
fig=plt.figure(figsize=(10, 5), dpi= 80, facecolor='w', edgecolor='k');
topcar = sns.barplot(x = topdfcar.index, y = topdfcar.Count)
topcar.set(xlabel='Carriers', ylabel='Number of Flights')
topcar.set_title('Number of Flights of Top 10 Carriers at Top 10 Airports');
In [30]:
delay_type_sum = dftop10[['UniqueCarrier', 'ArrDelay', 'DepDelay', 'CarrierDelay', 'SecurityDelay','LateAircraftDelay', 'WeatherDelay']].groupby(dftop10.UniqueCarrier).sum().dropna().head(n=20)
flights = dftop10.groupby('UniqueCarrier').count()
flights = flights[flights.index.isin(delay_type_sum.index)]
flights['Number_Flights'] = flights.Year
flights = flights[[ 'Number_Flights']]
flights.dropna(inplace = True)
delay_per_flight = delay_type_sum[['ArrDelay', 'DepDelay', 'CarrierDelay', 'SecurityDelay','LateAircraftDelay', 'WeatherDelay']].div(flights.Number_Flights, axis='index').sort_values(['DepDelay'], ascending=False).head(n=10)
In [31]:
fig=plt.figure(figsize=(10, 5), dpi= 80, facecolor='w', edgecolor='k');
plt.plot(range(len(delay_per_flight.index)), delay_per_flight.CarrierDelay)
plt.plot(range(len(delay_per_flight.index)), delay_per_flight.LateAircraftDelay)
plt.plot(range(len(delay_per_flight.index)), delay_per_flight.WeatherDelay)
plt.xticks(range(len(delay_per_flight.index)),delay_per_flight.index)
plt.legend(['CarrierDelay','LateAircraftDelay', 'WeatherDelay'], loc='upper right')
fig.suptitle('Average Minutes Per Flight Delay Per Carrier', fontsize=20)
plt.xlabel('Carrier', fontsize=18)
plt.ylabel('Minutes per Flight', fontsize=16)
plt.show()
In [32]:
dfWN = dftop10[dftop10['UniqueCarrier']=='WN']
In [33]:
fig=plt.figure(figsize=(10, 5), dpi= 80, facecolor='w', edgecolor='k');
delayed_per_month = dfWN[['isDelArr', 'Month']].groupby('Month').sum()
delayed_ratio_pm = delayed_per_month / dfWN[['isDelArr', 'Month']].groupby('Month').count()
dpm = sns.barplot(x = delayed_per_month.index, y = delayed_ratio_pm.isDelArr * 100);
dpm.set(xlabel='Month', ylabel='Percent Delayed')
dpm.set_title('Percent of South West Flights Arrival Delay Per Month');
In [34]:
delayed_per_day = dfWN[['isDelArr', 'DayOfWeek']].groupby('DayOfWeek').sum()
delayed_ratio_pd = delayed_per_day / dfWN[['isDelArr', 'DayOfWeek']].groupby('DayOfWeek').count()
fig=plt.figure(figsize=(10, 5), dpi= 80, facecolor='w', edgecolor='k');
dpm = sns.barplot(x = delayed_per_day.index, y = delayed_ratio_pd.isDelArr);
dpm.set(xlabel='Day', ylabel='Percent Delayed')
dpm.set_title('Percent of South West Flights Arrival Delay Per Week Day');
In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import cross_validation
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_curve, auc
dfWN = dfWN[dfWN.Cancelled != 1]
dfWN = dfWN[dfWN.Diverted != 1]
In [46]:
#defining our target variable
y = dfWN.isDelArr
X = dfWN[['Month','DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime', 'TailNum','Origin', 'Dest', 'Distance']]
for col in ['Month','DayofMonth', 'DayOfWeek']:
X[col] = X[col].astype('category')
X= pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
In [47]:
clf = LogisticRegression(class_weight = {1:0.19, 0:0.81})
clf = clf.fit(X_train, y_train)
In [48]:
preds = clf.predict_proba(X_test)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
fptp = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
roc_auc = metrics.auc(fpr,tpr)
In [49]:
fig=plt.figure(figsize=(5, 5), dpi= 80, facecolor='w', edgecolor='k');
plt.title('Receiver Operating Characteristic Logistic Regression Arrival Delay')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [50]:
y = dfWN.isDelDep
X = dfWN[['Month','DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime', 'TailNum','Origin', 'Dest', 'Distance']]
for col in ['Month','DayofMonth', 'DayOfWeek']:
X[col] = X[col].astype('category')
X= pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = LogisticRegression()
clf = clf.fit(X_train, y_train)
preds = clf.predict_proba(X_test)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
fptp = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
roc_auc = metrics.auc(fpr,tpr)
In [51]:
fig=plt.figure(figsize=(5, 5), dpi= 80, facecolor='w', edgecolor='k');
plt.title('Receiver Operating Characteristic for Logistic Regression Departure Delay')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [52]:
clf = RandomForestClassifier(max_features = None, oob_score=True)
clf = clf.fit(X_train, y_train)
In [53]:
preds = clf.predict_proba(X_test)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
fptp = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
roc_auc = metrics.auc(fpr,tpr)
In [54]:
fig=plt.figure(figsize=(5, 5), dpi= 80, facecolor='w', edgecolor='k');
plt.title('Receiver Operating Characteristic for Random Forest Arrival Delay')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [55]:
y = dfWN.isDelDep
X = dfWN[['Month','DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime', 'TailNum','Origin', 'Dest', 'Distance']]
for col in ['Month','DayofMonth', 'DayOfWeek']:
X[col] = X[col].astype('category')
X= pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = RandomForestClassifier(max_features = None, oob_score=True)
clf = clf.fit(X_train, y_train)
preds = clf.predict_proba(X_test)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
fptp = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
roc_auc = metrics.auc(fpr,tpr)
In [56]:
fig=plt.figure(figsize=(5, 5), dpi= 80, facecolor='w', edgecolor='k');
plt.title('Receiver Operating Characteristic for Random Forest Departure Delay')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [57]:
y = dfWN.isDelArr
X = dfWN[['Month','DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime', 'TailNum','Origin', 'Dest', 'Distance']]
for col in ['Month','DayofMonth', 'DayOfWeek']:
X[col] = X[col].astype('category')
X= pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto',
beta_1=0.9, beta_2=0.999, early_stopping=False,
epsilon=1e-08, hidden_layer_sizes=(10, 5), learning_rate='constant',
learning_rate_init=0.001, max_iter=1000, momentum=0.9,
nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
warm_start=False)
clf = clf.fit(X_train, y_train)
preds = clf.predict_proba(X_test)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
fptp = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
roc_auc = metrics.auc(fpr,tpr)
In [58]:
fig=plt.figure(figsize=(5, 5), dpi= 80, facecolor='w', edgecolor='k');
plt.title('Receiver Operating Characteristic for NN Arrival Delay')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [59]:
y = dfWN.isDelDep
X = dfWN[['Month','DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime', 'TailNum','Origin', 'Dest', 'Distance']]
for col in ['Month','DayofMonth', 'DayOfWeek']:
X[col] = X[col].astype('category')
X= pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto',
beta_1=0.9, beta_2=0.999, early_stopping=False,
epsilon=1e-08, hidden_layer_sizes=(10, 5), learning_rate='constant',
learning_rate_init=0.001, max_iter=1000, momentum=0.9,
nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
warm_start=False)
clf = clf.fit(X_train, y_train)
preds = clf.predict_proba(X_test)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
fptp = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
roc_auc = metrics.auc(fpr,tpr)
In [60]:
fig=plt.figure(figsize=(5, 5), dpi= 80, facecolor='w', edgecolor='k');
plt.title('Receiver Operating Characteristic for NN Departure Delay')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
Although we dominate the national market we are still a small player in the top 10 most used airports. That means we have a lot of opportunity for growth. One avenue of that can be reducing delays to increase customer satisfaction and reduce costs. Predicting if a flight will be delayed will play a crucial part in that.
In [61]:
fig=plt.figure(figsize=(10, 5), dpi= 80, facecolor='w', edgecolor='k');
plt.plot(range(len(delay_per_flight.index)), delay_per_flight.CarrierDelay)
plt.plot(range(len(delay_per_flight.index)), delay_per_flight.LateAircraftDelay)
plt.plot(range(len(delay_per_flight.index)), delay_per_flight.WeatherDelay)
plt.xticks(range(len(delay_per_flight.index)),delay_per_flight.index)
plt.legend(['CarrierDelay','LateAircraftDelay', 'WeatherDelay'], loc='upper right')
fig.suptitle('Average Minutes Per Flight Delay Per Carrier', fontsize=20)
plt.xlabel('Carrier', fontsize=18)
plt.ylabel('Minutes per Flight', fontsize=16)
plt.show()
In [62]:
conn.close()