In [8]:
import pandas as pd
import numpy as np
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import matplotlib.gridspec as gridspec
import scipy as sp
%matplotlib inline
This notebook will be describing an analysis of US domestic flight data. We will first take a dive into a single year, 2008 and do some basic analysis. Then we may expand this into multiple years to try and see trends. Finally we will use a single year to build a predictive model to predict weather or not a flight will be delayed or not.
In [12]:
df_flights_2008 = pd.read_csv('/home/hakim/Documents/Airline_delays-/flight_data_historical/2008.csv')
print(df_flights_2008.tail())
In [13]:
df_flights_2008.shape
Out[13]:
In [14]:
df_flights_2008.isnull().sum(axis=0)
Out[14]:
Next thing we would like to do is remove cancelled flights. ALthough they can be interesting for other analysis for ours they are not needed. As a side note only 0.02% of flights were cancelled
In [15]:
df = df_flights_2008
df = df[df.Cancelled != 1]
We will also remove diverted flights. For this analysis we would like to only focus on delays.
In [16]:
df = df[df.Diverted != 1]
df.isnull().sum(axis=0)
Out[16]:
What we have left are all non-cancelled diverted flights. Therefore we can remove the columns related to that. Also as we can see by looking at the numbers about 76% percent of flights were on time. That is not great.
In [17]:
df = df.drop(['Cancelled', 'Diverted', 'CancellationCode'], axis = 1)
If a flight is more than 10 minutes late for leaving or arriving I have marked it with a boolean variable below.
In [43]:
conditionsArr = [(df['ArrDelay'] > 10)]
choicesArr = [1]
df['isDelArr'] = np.select(conditionsArr, choicesArr, default = 0)
conditionsDep = [(df['DepDelay'] > 10)]
choicesDep = [1]
df['isDelDep'] = np.select(conditionsDep, choicesDep, default = 0)
In [58]:
ts = df[['isDelArr', 'isDelDep']].sum().to_frame()
ts.columns = ['Count']
sns.barplot(x = ts.index , y = ts.Count);
Now we will only focus on the top 10 airports. They most likely generate the most revenue.
In [59]:
dforig = pd.value_counts(df.Origin, sort=True).to_frame().reset_index()
dforig.columns = ['Origin','Count']
topdforig = dforig.head(n=10)
topdforig.set_index('Origin',inplace = True)
df = df[df['Origin'].isin(topdforig.index)]
grouped = df.groupby('Origin').sum()
grouped = grouped[['isDelArr','isDelDep']]
grouped['DelArrRa'] = grouped.isDelArr
grouped['DelDepRa'] = grouped.isDelDep
gs = gridspec.GridSpec(3, 1)
ax1 = plt.subplot(gs[0])
ax2 = plt.subplot(gs[1])
ax3 = plt.subplot(gs[2])
sns.barplot(x = grouped.index, y = grouped.DelArrRa, ax = ax1);
sns.barplot(x = grouped.index, y = grouped.DelDepRa, ax = ax2);
sns.barplot(x = topdforig.index, y = topdforig.Count, ax = ax3);
In [21]:
pd.options.display.float_format = '{:.2f}'.format
df[['ActualElapsedTime', 'ArrDelay', 'DepDelay', 'Distance']].describe()
Out[21]:
In [13]:
gs = gridspec.GridSpec(2, 2)
ax1 = plt.subplot(gs[0])
ax2 = plt.subplot(gs[1])
ax3 = plt.subplot(gs[2])
ax4 = plt.subplot(gs[3])
sns.distplot(df.ActualElapsedTime, ax = ax1);
sns.distplot(df.ArrDelay, ax = ax2);
sns.distplot(df.DepDelay, ax = ax3);
sns.distplot(df.Distance, ax = ax4);
In [14]:
dforig = pd.value_counts(df.Origin, sort=True).to_frame().reset_index()
dforig.columns = ['Origin','Count']
topdforig = dforig.head(n=10)
topdforig.set_index('Origin',inplace = True)
df = df[df['Origin'].isin(topdforig.index)]
dfa = pd.value_counts(df.UniqueCarrier, sort=True).to_frame().reset_index().head(n=10)
dfa.columns = ['Carrier','Count']
dfa.set_index('Carrier', inplace = True)
df = df[df['UniqueCarrier'].isin(dfa.index)]
delay_type_sum = df[['UniqueCarrier','CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']].groupby(df.UniqueCarrier).sum().dropna()
flights = df.groupby('UniqueCarrier').count()
flights = flights[flights.index.isin(delay_type_sum.index)]
flights['Number_Flights'] = flights.Year
flights = flights[[ 'Number_Flights']]
flights.dropna(inplace = True)
delay_per_flight = delay_type_sum[['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']].div(flights.Number_Flights, axis='index')
delay_per_flight.describe()
Out[14]:
In [15]:
gs = gridspec.GridSpec(2, 3)
ax1 = plt.subplot(gs[0])
ax2 = plt.subplot(gs[1])
ax3 = plt.subplot(gs[2])
ax4 = plt.subplot(gs[3])
ax5 = plt.subplot(gs[4])
sns.pointplot(x= delay_per_flight.index, y='CarrierDelay', data=delay_per_flight, ax = ax1)
sns.pointplot(x= delay_per_flight.index, y='WeatherDelay', data=delay_per_flight, ax = ax2)
sns.pointplot(x= delay_per_flight.index, y='NASDelay', data=delay_per_flight, ax = ax3)
sns.pointplot(x= delay_per_flight.index, y='SecurityDelay', data=delay_per_flight, ax = ax4);
sns.pointplot(x= delay_per_flight.index, y='LateAircraftDelay', data=delay_per_flight, ax = ax5);
In [61]:
dfWN = df[df['UniqueCarrier']=='WN']
delayed_per_month = dfWN[['isDelArr', 'Month']].groupby('Month').sum()
delayed_ratio_pm = delayed_per_month / dfWN[['isDelArr', 'Month']].groupby('Month').count()
sns.barplot(x = delayed_per_month.index, y = delayed_ratio_pm.isDelArr);
In [64]:
delayed_per_day = dfWN[['isDelArr', 'DayOfWeek']].groupby('DayOfWeek').sum()
delayed_ratio_pd = delayed_per_day / dfWN[['isDelArr', 'DayOfWeek']].groupby('DayOfWeek').count()
sns.barplot(x = delayed_per_day.index, y = delayed_ratio_pd.isDelArr);
Now we will start building a predictive model to predict if a plane will be delayed or not. We will start with a simple logistic regression. If it is not good enough we will move onto more complicated algorithms like SVM.
In [184]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import svm
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import cross_validation
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
In [182]:
#defining our target variable
y = dfWN.isDelArr
X = dfWN[['Month','DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime', 'TailNum', 'ActualElapsedTime', 'Origin', 'Dest', 'Distance']]
for col in ['Month','DayofMonth', 'DayOfWeek']:
X[col] = X[col].astype('category')
X_dum = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X_dum, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
In [68]:
model = LogisticRegression()
model = model.fit(X_train, y_train)
# check the accuracy on the training set
print(model.score(X_train, y_train),
model.score(X_test, y_test))
The test and training errors are very similar this means the model is most likely bias. For this model since 30% of flights are normally delayed the accuracy is really only 4% better than random guessing. That is not very good. We need to play around a bit. We will try some transformations next
In [26]:
X_dum[['DepTime_2', 'ArrTime_2', 'ActualElapsedTime_2','Distance_2']] = np.log(X_dum[['DepTime', 'ArrTime', 'ActualElapsedTime','Distance']])
In [43]:
X_train, X_test, y_train, y_test = train_test_split(X_dum, y, test_size=0.3)
#model = LogisticRegression()
#model = model.fit(X_train, y_train)
# check the accuracy on the training set
#print(model.score(X_train, y_train),
#model.score(X_test, y_test))
In [ ]:
clf = svm.SVC(kernel='linear', C=1, random_state=0)
clf = clf.fit(X_train.head(n=10000), y_train.head(n=10000))
clf.score(X_test, y_test)
Transformations did not lead to better results so we will move on to a more complicated model like SVM.
In [69]:
clf = RandomForestClassifier(n_jobs=2, random_state=0, max_features = None, oob_score=True)
clf = clf.fit(X_train, y_train)
print(clf.oob_score_,clf.score(X_test, y_test))
In [192]:
clf = MLPClassifier(activation='tanh', alpha=1e-05, batch_size='auto',
beta_1=0.9, beta_2=0.999, early_stopping=False,
epsilon=1e-08, hidden_layer_sizes=(100, 4), learning_rate='constant',
learning_rate_init=0.001, max_iter=1000, momentum=0.9,
nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
warm_start=False)
clf = clf.fit(X_train, y_train)
clf.score(X_test,y_test)
Out[192]:
In [146]:
import sqlite3
conn = sqlite3.connect("/home/hakim/Documents/Airline_delays-/flight_data_historical/delayed.sqlite3")
In [120]:
flightcount = pd.read_sql_query("select distinct Year ,count(Origin) as number_of_flights from delayed where Cancelled != 1 group by Year;", conn)
flightcount.set_index('Year', inplace = True)
In [124]:
fc = sns.pointplot(x = flightcount.index, y = flightcount.number_of_flights)
fc.set_xticklabels(flightcount.index, rotation=90);
In [128]:
flightscancelled = pd.read_sql_query("select distinct Year ,sum(Cancelled) as canceled_flights from delayed group by Year;", conn)
flightscancelled.set_index('Year', inplace = True)
In [ ]:
delflcount = pd.read_sql_query("select distinct Year ,count(Origin) as number_of_flights from delayed where Cancelled != 1 group by Year;", conn)
In [134]:
fcl = sns.pointplot(x = flightscancelled.index, y = (flightscancelled.canceled_flights /flightcount.number_of_flights) * 100)
fcl.set_xticklabels(flightscancelled.index, rotation=90);
In [136]:
flightsdiverted = pd.read_sql_query("select distinct Year ,sum(Diverted) as Diverted from delayed group by Year;", conn)
flightsdiverted.set_index('Year', inplace = True)
In [140]:
dd = sns.pointplot(x = flightsdiverted.index, y = flightsdiverted.Diverted /flightcount.number_of_flights)
dd.set_xticklabels(flightsdiverted.index, rotation=90);
In [153]:
delayarr = pd.read_sql_query("select distinct year,sum(ArrDelay) as ArrDelay from delayed where ArrDelay >= 30 group by year;", conn)
delayarr.set_index('Year', inplace = True)
In [157]:
Out[157]:
In [154]:
da = sns.pointplot(x = delayarr.index, y = delayarr.ArrDelay /flightcount.number_of_flights)
da.set_xticklabels(delayarr.index, rotation=90);
In [102]:
delaytrendweather = pd.read_sql_query("select distinct year,sum(WeatherDelay) as weatherdelay from delayed group by year;", conn)
delaytrendweather.set_index('Year', inplace = True)
In [145]:
dw = sns.pointplot(x = delaytrendweather.index, y = delaytrendweather.weatherdelay)
dw.set_xticklabels(delaytrendweather.index, rotation=90);
In [133]:
delaytrendsecuirty = pd.read_sql_query("select sum(SecurityDelay) from delayed group by year;", conn)
In [144]:
delaytrendsecuirty.columns=['SecurityDelay']
sns.pointplot(x = delaytrendsecuirty.index, y = delaytrendsecuirty.SecurityDelay);
In [141]:
delaytrendlateflight= pd.read_sql_query("select sum(LateAircraftDelay) from delayed group by year;", conn)
In [148]:
delaytrendlateflight.columns =['LateAircraftDelay']
sns.pointplot(x = delaytrendlateflight.index, y = delaytrendlateflight.LateAircraftDelay);
In [179]:
topar = pd.read_sql_query("select Year, Origin, (select count(Origin) as count_origin from (select count(Origin) from delayed as b where b.Year = a.Year limit 1) ) as count_origin from delayed as a group by Year;", conn)
In [ ]: