Use this dataset of airline arrival information to predict how late flights will be. A flight only counts as late if it is more than 30 minutes late.
In [37]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import math
from matplotlib.mlab import PCA as mlabPCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
import seaborn as sns
import scipy.stats as stats
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, KFold
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.decomposition import PCA as sklearn_pca
import locale
from locale import atof
import warnings
from IPython.display import display
from sklearn import linear_model
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_selection import f_regression
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import xlrd
from sklearn import ensemble
import time
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
In [38]:
# Load 'Airlines 2008.csv' into a DataFrame and preview its first five rows.
airlines = pd.read_csv(filepath_or_buffer='Airlines 2008.csv')
airlines.head(n=5)
Out[38]:
In [39]:
# Preview the frame with missing values back-filled (each NaN takes the next
# valid value below it in the same column).
# `DataFrame.bfill()` is used because `fillna(method='bfill')` is deprecated
# in recent pandas releases.
# NOTE(review): the result is only displayed, not assigned, so `airlines`
# itself still contains its NaNs -- confirm that this is intended.
airlines.bfill()
Out[39]:
In [40]:
#airlines.loc[airlines['ArrDelay'] <= 30, 'ArrDelay'] = 0
#airlines.loc[airlines['ArrDelay'] > 30, 'ArrDelay'] = airlines['ArrDelay'] - 30
In [41]:
# Number of rows in the dataset.
airlines.shape[0]
Out[41]:
In [42]:
# Print pandas' summary of the DataFrame.
airlines.info()
In [44]:
# Mean arrival delay (ArrDelay) for each day of the month
grouped = airlines[['ArrDelay', 'DayofMonth']].groupby('DayofMonth').mean()
# Bar chart of the mean arrival delay by day of the month
grouped.plot(kind='bar')
Out[44]:
In [45]:
# Mean arrival delay (ArrDelay) for each day of the week
grouped = airlines[['ArrDelay', 'DayOfWeek']].groupby('DayOfWeek').mean()
# Bar chart of the mean arrival delay by day of the week
grouped.plot(kind='bar')
Out[45]:
In [46]:
# Derive the scheduled arrival hour from CRSArrTime. The value is treated as
# an HHMM number, so integer division by 100 yields the hour in one
# vectorised step instead of a per-row Python lambda with a string round-trip.
# NOTE(review): assumes CRSArrTime lies in 0..2400 with no missing values --
# confirm against the data.
airlines['hour'] = (airlines['CRSArrTime'] // 100).astype(int)
# Mean arrival delay (ArrDelay) for each scheduled arrival hour
grouped = airlines[['ArrDelay', 'hour']].groupby('hour').mean()
# Bar chart of the mean arrival delay by scheduled arrival hour
grouped.plot(kind='bar')
Out[46]:
In [47]:
# Mean arrival delay (ArrDelay) for each month
grouped = airlines[['ArrDelay', 'Month']].groupby('Month').mean()
# Bar chart of the mean arrival delay by month
grouped.plot(kind='bar')
Out[47]:
In [48]:
# Mean arrival delay (ArrDelay) for each month
# (same groupby/mean over 'Month' as the cell directly before this one)
grouped = airlines[['ArrDelay', 'Month']].groupby('Month').mean()
# Bar chart of the mean arrival delay by month
grouped.plot(kind='bar')
Out[48]:
In [49]:
# Count the missing values in every column of the raw frame.
airlines.isna().sum()
Out[49]:
In [52]:
# Drop columns that are not going to be used.
# The labels are passed directly through `columns=`; selecting them into a
# temporary sub-DataFrame first would copy that data only to read back the
# column names.
airlines1 = airlines.drop(columns=[
    'Year', 'Month', 'UniqueCarrier', 'FlightNum',
    'TailNum', 'Origin', 'Dest',
    'CancellationCode',
    'CarrierDelay',
    'WeatherDelay',
    'NASDelay',
    'SecurityDelay',
    'LateAircraftDelay',
])
In [ ]:
# Remove a further set of columns from airlines1, then print a summary of
# what is left.
# Labels go through `columns=` (no temporary sub-DataFrame), and 'AirTime' is
# listed once rather than twice.
airlines2 = airlines1.drop(columns=[
    'DepTime', 'ActualElapsedTime',
    'CRSElapsedTime', 'AirTime',
    'DepDelay', 'TaxiIn', 'TaxiOut', 'Cancelled', 'Diverted',
])
airlines2.info()
In [ ]:
# Count the missing values in every remaining column.
airlines2.isna().sum()
In [ ]:
# Keep only the rows with no missing value in any column, then confirm that
# no missing values remain.
airlines3 = airlines2.dropna(axis=0)
airlines3.isna().sum()
In [ ]:
# Per-column non-null counts: first for the rows whose ArrDelay exceeds 30,
# then for the whole cleaned frame.
print(airlines3.loc[airlines3['ArrDelay'].gt(30)].count())
print(airlines3.count())
In [ ]:
# Define outcome & predictors.
# Outcome: True when the flight arrived more than 30 minutes late.
y = airlines3['ArrDelay'] > 30
# Predictors: every column except ArrDelay. The label is computed directly
# from ArrDelay, so keeping it in X would hand the answer to the model
# (target leakage) and make every score meaningless.
X = airlines3.drop(columns=['ArrDelay'])
# Scale the data column-wise. The original index is carried over so that X
# and y stay aligned by label as well as by position.
# NOTE(review): scaling before the split lets test-set statistics influence
# the training features; consider fitting the scaler on X_train only.
names = X.columns
X = pd.DataFrame(preprocessing.scale(X), columns=names, index=X.index)
# Split into test and train sets (33% held out, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# 20-fold splitter used for cross-validation
kf = KFold(20)
In [ ]:
# Initialize the model and fit it on the training split only.
lr = LogisticRegression()
fittrain = lr.fit(X_train, y_train)
# `fit` returns the estimator itself, so calling `lr.fit` again on the test
# split would overwrite the training fit and leave `fittrain` pointing at a
# model trained on test data. A separate estimator keeps the fits independent.
fittest = LogisticRegression().fit(X_test, y_test)
# Predict on the training and test sets with the model fitted on training data
predtrain_y = lr.predict(X_train)
predtest_y = lr.predict(X_test)
In [ ]:
# Print the coefficients and the intercept of the estimator held in
# `fittrain`, one per line, then show the mean cross-validated score of `lr`
# on the training split using the `kf` folds.
print(fittrain.coef_, fittrain.intercept_, sep='\n')
np.mean(cross_val_score(lr, X_train, y_train, cv=kf))
In [ ]:
# Mean cross-validated score of `lr` on the training split using the `kf` folds.
cross_val_score(estimator=lr, X=X_train, y=y_train, cv=kf).mean()
In [ ]: