Use this dataset of airline arrival information to predict how late flights will be. A flight only counts as late if it is more than 30 minutes late.


In [37]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import math
from matplotlib.mlab import PCA as mlabPCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
import seaborn as sns
import scipy.stats as stats
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, KFold
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.decomposition import PCA as sklearn_pca
import locale
from locale import atof
import warnings
from IPython.display import display
from sklearn import linear_model
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_selection import f_regression
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import xlrd
from sklearn import ensemble
import time
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

In [38]:
# Read and import data
airlines = pd.read_csv('Airlines 2008.csv')
airlines.head()


Out[38]:
Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime UniqueCarrier FlightNum ... TaxiIn TaxiOut Cancelled CancellationCode Diverted CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay
0 2008 1 3 4 2003.0 1955 2211.0 2225 WN 335 ... 4.0 8.0 0 NaN 0 NaN NaN NaN NaN NaN
1 2008 1 3 4 754.0 735 1002.0 1000 WN 3231 ... 5.0 10.0 0 NaN 0 NaN NaN NaN NaN NaN
2 2008 1 3 4 628.0 620 804.0 750 WN 448 ... 3.0 17.0 0 NaN 0 NaN NaN NaN NaN NaN
3 2008 1 3 4 926.0 930 1054.0 1100 WN 1746 ... 3.0 7.0 0 NaN 0 NaN NaN NaN NaN NaN
4 2008 1 3 4 1829.0 1755 1959.0 1925 WN 3920 ... 3.0 10.0 0 NaN 0 2.0 0.0 0.0 0.0 32.0

5 rows × 29 columns


In [39]:
airlines.fillna(method='bfill')


Out[39]:
Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime UniqueCarrier FlightNum ... TaxiIn TaxiOut Cancelled CancellationCode Diverted CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay
0 2008 1 3 4 2003.0 1955 2211.0 2225 WN 335 ... 4.0 8.0 0 A 0 2.0 0.0 0.0 0.0 32.0
1 2008 1 3 4 754.0 735 1002.0 1000 WN 3231 ... 5.0 10.0 0 A 0 2.0 0.0 0.0 0.0 32.0
2 2008 1 3 4 628.0 620 804.0 750 WN 448 ... 3.0 17.0 0 A 0 2.0 0.0 0.0 0.0 32.0
3 2008 1 3 4 926.0 930 1054.0 1100 WN 1746 ... 3.0 7.0 0 A 0 2.0 0.0 0.0 0.0 32.0
4 2008 1 3 4 1829.0 1755 1959.0 1925 WN 3920 ... 3.0 10.0 0 A 0 2.0 0.0 0.0 0.0 32.0
5 2008 1 3 4 1940.0 1915 2121.0 2110 WN 378 ... 4.0 10.0 0 A 0 10.0 0.0 0.0 0.0 47.0
6 2008 1 3 4 1937.0 1830 2037.0 1940 WN 509 ... 3.0 7.0 0 A 0 10.0 0.0 0.0 0.0 47.0
7 2008 1 3 4 1039.0 1040 1132.0 1150 WN 535 ... 7.0 7.0 0 A 0 8.0 0.0 0.0 0.0 72.0
8 2008 1 3 4 617.0 615 652.0 650 WN 11 ... 6.0 19.0 0 A 0 8.0 0.0 0.0 0.0 72.0
9 2008 1 3 4 1620.0 1620 1639.0 1655 WN 810 ... 3.0 6.0 0 A 0 8.0 0.0 0.0 0.0 72.0
10 2008 1 3 4 706.0 700 916.0 915 WN 100 ... 5.0 19.0 0 A 0 8.0 0.0 0.0 0.0 72.0
11 2008 1 3 4 1644.0 1510 1845.0 1725 WN 1333 ... 6.0 8.0 0 A 0 8.0 0.0 0.0 0.0 72.0
12 2008 1 3 4 1426.0 1430 1426.0 1425 WN 829 ... 9.0 12.0 0 A 0 3.0 0.0 0.0 0.0 12.0
13 2008 1 3 4 715.0 715 720.0 710 WN 1016 ... 7.0 21.0 0 A 0 3.0 0.0 0.0 0.0 12.0
14 2008 1 3 4 1702.0 1700 1651.0 1655 WN 1827 ... 4.0 10.0 0 A 0 3.0 0.0 0.0 0.0 12.0
15 2008 1 3 4 1029.0 1020 1021.0 1010 WN 2272 ... 6.0 9.0 0 A 0 3.0 0.0 0.0 0.0 12.0
16 2008 1 3 4 1452.0 1425 1640.0 1625 WN 675 ... 7.0 8.0 0 A 0 3.0 0.0 0.0 0.0 12.0
17 2008 1 3 4 754.0 745 940.0 955 WN 1144 ... 5.0 16.0 0 A 0 0.0 0.0 0.0 0.0 16.0
18 2008 1 3 4 1323.0 1255 1526.0 1510 WN 4 ... 4.0 9.0 0 A 0 0.0 0.0 0.0 0.0 16.0
19 2008 1 3 4 1416.0 1325 1512.0 1435 WN 54 ... 2.0 5.0 0 A 0 12.0 0.0 0.0 0.0 25.0
20 2008 1 3 4 706.0 705 807.0 810 WN 68 ... 3.0 7.0 0 A 0 7.0 0.0 0.0 0.0 12.0
21 2008 1 3 4 1657.0 1625 1754.0 1735 WN 623 ... 5.0 5.0 0 A 0 7.0 0.0 0.0 0.0 12.0
22 2008 1 3 4 1900.0 1840 1956.0 1950 WN 717 ... 2.0 5.0 0 A 0 40.0 0.0 0.0 0.0 7.0
23 2008 1 3 4 1039.0 1030 1133.0 1140 WN 1244 ... 2.0 5.0 0 A 0 40.0 0.0 0.0 0.0 7.0
24 2008 1 3 4 801.0 800 902.0 910 WN 2101 ... 3.0 5.0 0 A 0 40.0 0.0 0.0 0.0 7.0
25 2008 1 3 4 1520.0 1455 1619.0 1605 WN 2553 ... 2.0 7.0 0 A 0 40.0 0.0 0.0 0.0 7.0
26 2008 1 3 4 1422.0 1255 1657.0 1610 WN 188 ... 6.0 6.0 0 A 0 40.0 0.0 0.0 0.0 7.0
27 2008 1 3 4 1954.0 1925 2239.0 2235 WN 1754 ... 3.0 7.0 0 A 0 5.0 0.0 0.0 0.0 59.0
28 2008 1 3 4 636.0 635 921.0 945 WN 2275 ... 5.0 13.0 0 A 0 5.0 0.0 0.0 0.0 59.0
29 2008 1 3 4 734.0 730 958.0 1020 WN 550 ... 2.0 8.0 0 A 0 5.0 0.0 0.0 0.0 59.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7009698 2008 12 13 6 1625.0 1635 1744.0 1758 DL 1605 ... 12.0 21.0 0 NaN 0 0.0 0.0 0.0 0.0 21.0
7009699 2008 12 13 6 1254.0 1221 1420.0 1359 DL 1609 ... 9.0 13.0 0 NaN 0 0.0 0.0 0.0 0.0 21.0
7009700 2008 12 13 6 1842.0 1845 1953.0 2006 DL 1610 ... 8.0 11.0 0 NaN 0 16.0 0.0 10.0 0.0 12.0
7009701 2008 12 13 6 1528.0 1500 1720.0 1642 DL 1611 ... 4.0 29.0 0 NaN 0 16.0 0.0 10.0 0.0 12.0
7009702 2008 12 13 6 1531.0 1522 1822.0 1823 DL 1612 ... 9.0 14.0 0 NaN 0 51.0 0.0 13.0 0.0 0.0
7009703 2008 12 13 6 1910.0 1910 2017.0 2016 DL 1612 ... 5.0 24.0 0 NaN 0 51.0 0.0 13.0 0.0 0.0
7009704 2008 12 13 6 1441.0 1445 1604.0 1622 DL 1613 ... 8.0 10.0 0 NaN 0 51.0 0.0 13.0 0.0 0.0
7009705 2008 12 13 6 921.0 830 1112.0 1008 DL 1616 ... 8.0 21.0 0 NaN 0 51.0 0.0 13.0 0.0 0.0
7009706 2008 12 13 6 1435.0 1440 1701.0 1704 DL 1618 ... 20.0 10.0 0 NaN 0 0.0 0.0 0.0 0.0 17.0
7009707 2008 12 13 6 1750.0 1755 2010.0 2015 DL 1618 ... 7.0 20.0 0 NaN 0 0.0 0.0 0.0 0.0 17.0
7009708 2008 12 13 6 706.0 710 850.0 837 DL 1619 ... 23.0 32.0 0 NaN 0 0.0 0.0 0.0 0.0 17.0
7009709 2008 12 13 6 1552.0 1520 1735.0 1718 DL 1620 ... 9.0 7.0 0 NaN 0 0.0 0.0 0.0 0.0 17.0
7009710 2008 12 13 6 1250.0 1220 1617.0 1552 DL 1621 ... 9.0 18.0 0 NaN 0 3.0 0.0 0.0 0.0 22.0
7009711 2008 12 13 6 1033.0 1041 1255.0 1303 DL 1622 ... 9.0 15.0 0 NaN 0 0.0 0.0 19.0 0.0 0.0
7009712 2008 12 13 6 840.0 843 1025.0 1021 DL 1624 ... 6.0 46.0 0 NaN 0 0.0 0.0 19.0 0.0 0.0
7009713 2008 12 13 6 810.0 815 1504.0 1526 DL 1625 ... 7.0 17.0 0 NaN 0 0.0 0.0 19.0 0.0 0.0
7009714 2008 12 13 6 547.0 545 646.0 650 DL 1627 ... 8.0 13.0 0 NaN 0 0.0 0.0 19.0 0.0 0.0
7009715 2008 12 13 6 848.0 850 1024.0 1005 DL 1628 ... 4.0 44.0 0 NaN 0 0.0 0.0 19.0 0.0 0.0
7009716 2008 12 13 6 936.0 936 1114.0 1119 DL 1630 ... 4.0 24.0 0 NaN 0 0.0 57.0 18.0 0.0 0.0
7009717 2008 12 13 6 657.0 600 904.0 749 DL 1631 ... 15.0 34.0 0 NaN 0 0.0 57.0 18.0 0.0 0.0
7009718 2008 12 13 6 1007.0 847 1149.0 1010 DL 1631 ... 8.0 32.0 0 NaN 0 1.0 0.0 19.0 0.0 79.0
7009719 2008 12 13 6 638.0 640 808.0 753 DL 1632 ... 14.0 26.0 0 NaN 0 0.0 0.0 15.0 0.0 0.0
7009720 2008 12 13 6 756.0 800 1032.0 1026 DL 1633 ... 23.0 17.0 0 NaN 0 0.0 0.0 16.0 0.0 0.0
7009721 2008 12 13 6 612.0 615 923.0 907 DL 1635 ... 5.0 23.0 0 NaN 0 0.0 0.0 16.0 0.0 0.0
7009722 2008 12 13 6 749.0 750 901.0 859 DL 1636 ... 20.0 11.0 0 NaN 0 NaN NaN NaN NaN NaN
7009723 2008 12 13 6 1002.0 959 1204.0 1150 DL 1636 ... 6.0 45.0 0 NaN 0 NaN NaN NaN NaN NaN
7009724 2008 12 13 6 834.0 835 1021.0 1023 DL 1637 ... 5.0 23.0 0 NaN 0 NaN NaN NaN NaN NaN
7009725 2008 12 13 6 655.0 700 856.0 856 DL 1638 ... 24.0 12.0 0 NaN 0 NaN NaN NaN NaN NaN
7009726 2008 12 13 6 1251.0 1240 1446.0 1437 DL 1639 ... 13.0 13.0 0 NaN 0 NaN NaN NaN NaN NaN
7009727 2008 12 13 6 1110.0 1103 1413.0 1418 DL 1641 ... 8.0 11.0 0 NaN 0 NaN NaN NaN NaN NaN

7009728 rows × 29 columns


In [40]:
#airlines.loc[airlines['ArrDelay'] <= 30, 'ArrDelay'] = 0
#airlines.loc[airlines['ArrDelay'] > 30, 'ArrDelay'] = airlines['ArrDelay'] - 30

In [41]:
len(airlines)


Out[41]:
7009728

In [42]:
airlines.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7009728 entries, 0 to 7009727
Data columns (total 29 columns):
Year                 int64
Month                int64
DayofMonth           int64
DayOfWeek            int64
DepTime              float64
CRSDepTime           int64
ArrTime              float64
CRSArrTime           int64
UniqueCarrier        object
FlightNum            int64
TailNum              object
ActualElapsedTime    float64
CRSElapsedTime       float64
AirTime              float64
ArrDelay             float64
DepDelay             float64
Origin               object
Dest                 object
Distance             int64
TaxiIn               float64
TaxiOut              float64
Cancelled            int64
CancellationCode     object
Diverted             int64
CarrierDelay         float64
WeatherDelay         float64
NASDelay             float64
SecurityDelay        float64
LateAircraftDelay    float64
dtypes: float64(14), int64(10), object(5)
memory usage: 1.5+ GB

In [44]:
# Compute average number of delayed flights per month
grouped = airlines[['ArrDelay', 'DayofMonth']].groupby('DayofMonth').mean()

# plot average delays by month
grouped.plot(kind='bar')


Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x163fb45e198>

In [45]:
# Compute average number of delayed flights per month
grouped = airlines[['ArrDelay', 'DayOfWeek']].groupby('DayOfWeek').mean()

# plot average delays by month
grouped.plot(kind='bar')


Out[45]:
<matplotlib.axes._subplots.AxesSubplot at 0x163fb45e4a8>

In [46]:
# Compute average number of delayed flights per month
airlines['hour'] = airlines['CRSArrTime'].map(lambda x: int(str(int(x)).zfill(4)[:2]))
grouped = airlines[['ArrDelay', 'hour' ]].groupby('hour').mean()

# plot average delays by month
grouped.plot(kind='bar')


Out[46]:
<matplotlib.axes._subplots.AxesSubplot at 0x1645180f160>

In [47]:
# Compute average number of delayed flights per month
grouped = airlines[['ArrDelay', 'Month']].groupby('Month').mean()

# plot average delays by month
grouped.plot(kind='bar')


Out[47]:
<matplotlib.axes._subplots.AxesSubplot at 0x165d138d5f8>

In [48]:
# Compute average number of delayed flights per month
grouped = airlines[['ArrDelay', 'Month']].groupby('Month').mean()

# plot average delays by month
grouped.plot(kind='bar')


Out[48]:
<matplotlib.axes._subplots.AxesSubplot at 0x16451819e80>

In [49]:
airlines.isnull().sum()


Out[49]:
Year                       0
Month                      0
DayofMonth                 0
DayOfWeek                  0
DepTime               136246
CRSDepTime                 0
ArrTime               151649
CRSArrTime                 0
UniqueCarrier              0
FlightNum                  0
TailNum                83365
ActualElapsedTime     154699
CRSElapsedTime           844
AirTime               154699
ArrDelay              154699
DepDelay              136246
Origin                     0
Dest                       0
Distance                   0
TaxiIn                151649
TaxiOut               137058
Cancelled                  0
CancellationCode     6872294
Diverted                   0
CarrierDelay         5484993
WeatherDelay         5484993
NASDelay             5484993
SecurityDelay        5484993
LateAircraftDelay    5484993
hour                       0
dtype: int64

In [52]:
#Drop columns that are not going to be used
airlines1 = airlines.drop(airlines[['Year','Month','UniqueCarrier','FlightNum',
                                    'TailNum','Origin','Dest',
                                    'CancellationCode',
                                   'CarrierDelay',
                                   'WeatherDelay',
                                   'NASDelay',
                                   'SecurityDelay',
                                   'LateAircraftDelay']],axis=1)

In [ ]:
airlines2=airlines1.drop(airlines1[['DepTime','ActualElapsedTime',
                                    'CRSElapsedTime','AirTime',
                                    'DepDelay','TaxiIn','TaxiOut','AirTime','Cancelled','Diverted'
                                    ]],axis=1)

airlines2.info()

In [ ]:
airlines2.isnull().sum()

In [ ]:
airlines3 = airlines2.dropna(how='any') 
airlines3.isnull().sum()

In [ ]:
print(airlines3[airlines3['ArrDelay'] > 30].count())
print(airlines3.count())

In [ ]:
#Define Outcome & Predictors

y = airlines3['ArrDelay'] > 30
X = airlines3

#Scale the data
names = X.columns
X = pd.DataFrame(preprocessing.scale(X), columns = names)

#Split into test and train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

#KFOld
kf = KFold(20)

In [ ]:
# Initialize and fit the model.
lr = LogisticRegression()
fittrain = lr.fit(X_train,y_train)
fittest = lr.fit(X_test,y_test)

# Predict on training set
predtrain_y = lr.predict(X_train)
predtest_y = lr.predict(X_test)

In [ ]:
print(fittrain.coef_)
print(fittrain.intercept_)
cross_val_score(lr, X_train, y_train, cv=kf).mean()

In [ ]:
cross_val_score(lr, X_train, y_train, cv=kf).mean()

In [ ]: