In [1]:
import pandas as pd
In [2]:
# Load the on-time performance CSV into a DataFrame.
# The path is relative to the notebook's working directory.
df = pd.read_csv('unzipped_data/On_Time_On_Time_Performance_2016_8.csv')
In [5]:
# Show every column name as a plain list (avoids pandas' truncated Index repr).
list(df.columns)
Out[5]:
In [ ]:
In [8]:
# (number of rows, number of columns) of the loaded frame.
df.shape
Out[8]:
In [6]:
# Number of distinct values in the Origin column.
len(set(df.Origin))
Out[6]:
In [19]:
# Smallest FlightDate value. read_csv was not asked to parse dates, so this is
# presumably a string comparison -- verify the format sorts chronologically.
df.FlightDate.min()
Out[19]:
In [20]:
# Largest FlightDate value (same caveat as the min: presumably compared as text).
df.FlightDate.max()
Out[20]:
In [21]:
# Summary statistics of the non-null DepTime values.
# The "count" row of describe() is already the number of non-null entries, so
# a separate .count() expression earlier in the cell is unnecessary: only the
# last expression of a cell is displayed, and its result was being discarded.
df.DepTime.dropna().describe()
Out[21]:
In [11]:
# Columns of interest, grouped by theme. The concatenation order is the
# order of the resulting list.
needed_columns = (
    # calendar fields
    ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'FlightDate']
    # carrier and route
    + ['UniqueCarrier', 'Origin', 'OriginCityName', 'Dest', 'DestCityName']
    # departure
    + ['CRSDepTime', 'DepTime', 'DepDelay', 'DepDelayMinutes', 'DepDel15',
       'DepartureDelayGroups', 'DepTimeBlk']
    # arrival
    + ['CRSArrTime', 'ArrTime', 'ArrDelay', 'ArrDelayMinutes', 'ArrDel15',
       'ArrTimeBlk']
    # flight status
    + ['Cancelled', 'Diverted']
    # duration and distance
    + ['CRSElapsedTime', 'ActualElapsedTime', 'Distance', 'DistanceGroup']
)
In [10]:
# Mean of DepDel15 for each DayOfWeek value (1=Monday, 2=Tuesday, etc.).
# DepDel15 is presumably a 0/1 "departed 15+ minutes late" flag, in which case
# this is the *fraction* (0-1) of delayed flights per weekday, not a percentage.
df.groupby('DayOfWeek')[['DepDel15']].mean()
Out[10]:
In [195]:
# Number of non-null DepDelayMinutes records per carrier code.
df.groupby('UniqueCarrier')[['DepDelayMinutes']].count()
Out[195]:
In [11]:
# Average DepDelayMinutes for each carrier code.
df.groupby('UniqueCarrier')[['DepDelayMinutes']].mean()
Out[11]:
In [ ]:
# Lookup from two-character carrier code to a display name.
CARRIERS = dict([
    ('AA', 'American'),
    ('AS', 'Alaska'),
    ('B6', 'Jet Blue'),
    ('DL', 'Delta'),
    ('EV', 'Express Jet'),
    ('F9', 'Frontier'),
    ('HA', 'Hawaiian'),
    ('NK', 'Spirit'),
    ('OO', 'SkyWest'),
    ('UA', 'United'),
    ('VX', 'Virgin'),
    ('WN', 'Southwest'),
])
In [91]:
# Mean DepDel15 for each Origin value.
df.groupby('Origin')[['DepDel15']].mean()
Out[91]:
In [ ]:
In [90]:
# Mean DepDel15 per origin, kept as a one-column DataFrame indexed by Origin.
# Presumably this is the share of *departures* delayed 15+ minutes (DepDel15 is
# a departure-delay column, not an arrival / on-time measure) -- confirm
# against the data dictionary.
mean_dep_delay15 = df.groupby('Origin')[['DepDel15']].mean()
In [17]:
# Number of distinct values in the Origin column (same check as earlier in the notebook).
len(set(df.Origin))
Out[17]:
In [23]:
# Bind `x` to the first Origin value by leaving the loop on its first pass.
# Looks like leftover exploration: nothing else happens in the loop.
for x in df.Origin:
    break
In [26]:
import numpy as np
Features:
Objective function:
In [210]:
# Sanity check: does DepDel15 agree with "DepDelayMinutes >= 15" on each row?
# The comparison is vectorized instead of looping over every row in Python;
# the result is still a plain list of bools. Rows where either value is NaN
# compare as False, exactly as in an element-wise `==`.
match = list((df.DepDelayMinutes >= 15) == df.DepDel15)
In [67]:
# Bin edges for grouping origins by delay rate: 0, the 20/40/60/80th
# percentiles of the per-origin means, and 1.1 as an upper edge above any
# possible mean of a 0/1 flag.
# nanpercentile is used so that an origin whose mean is NaN (no usable
# DepDel15 rows) does not turn every cut point into NaN, which would leave all
# origin groups empty. Without NaNs the result is the same as np.percentile.
quantiles = [0] + list(np.nanpercentile(mean_dep_delay15, [20, 40, 60, 80])) + [1.1]
In [68]:
# Display the bin edges computed above.
quantiles
Out[68]:
In [93]:
# One set of Origin codes per [low, high) band of consecutive bin edges.
origin_groups = []
for low, high in zip(quantiles[:-1], quantiles[1:]):
    # Row-wise mask: True where the origin's mean lies in [low, high).
    # NaN means compare as False and therefore fall in no band.
    in_band = ((mean_dep_delay15 >= low) & (mean_dep_delay15 < high)).all(axis=1)
    origin_groups.append(set(mean_dep_delay15.index[in_band.values]))
In [94]:
# Number of origins that landed in each group.
[len(x) for x in origin_groups]
Out[94]:
In [76]:
# Add one 0/1 indicator column per origin group (OriginGroup0, OriginGroup1, ...).
# Series.isin does the membership test for the whole column at once instead of
# evaluating `o in group` row by row in Python.
for i, group in enumerate(origin_groups):
    df['OriginGroup%s' % i] = df.Origin.isin(group).astype(int)
In [87]:
# Add one 0/1 indicator column per carrier code (CarrierAA, CarrierAS, ...).
# The codes are sorted so the feature order is the same on every run: iterating
# a set of strings yields a different order per interpreter session because of
# hash randomization. Missing carrier values are skipped, since a NaN code could
# never match any row anyway.
unique_carriers = sorted(df.UniqueCarrier.dropna().unique())
for carrier in unique_carriers:
    df['Carrier%s' % carrier] = (df.UniqueCarrier == carrier).astype(int)
In [97]:
# Add one 0/1 indicator column per distinct DayOfWeek value (DayOfWeek1, ...).
# .tolist() keeps the values as plain Python scalars; the equality test is
# vectorized rather than evaluated row by row.
days_of_week = sorted(df.DayOfWeek.unique().tolist())
for dow in days_of_week:
    df['DayOfWeek%s' % dow] = (df.DayOfWeek == dow).astype(int)
In [ ]:
In [297]:
# Imported here because, in top-to-bottom order, this cell runs before the
# notebook's later `import matplotlib.pyplot as plt` cell; without it a
# Restart & Run All stops with a NameError on `plt`.
import matplotlib.pyplot as plt

# Scatter of departure delay against departure time for the first 5000 rows
# that have both values present.
clean_df = df[['DepTime', 'DepDelayMinutes']].dropna()
plt.xlabel('DepTime')
plt.ylabel('DepDelayMinutes')
plt.scatter(x=clean_df.DepTime.iloc[:5000], y=clean_df.DepDelayMinutes.iloc[:5000])
Out[297]:
In [99]:
# Edges of the departure-time buckets: -1 as a lower edge, every 400 up to
# 2000, then 2401 as the upper edge (presumably so a DepTime of 2400 still
# falls inside the last bucket -- confirm the encoding of DepTime).
thresholds = [-1] + list(range(400, 2001, 400)) + [2401]
In [110]:
# Add one 0/1 indicator column per departure-time bucket (DepTimeBucket0, ...).
# Bucket i covers thresholds[i] <= DepTime < thresholds[i + 1]; rows with a
# missing DepTime get 0 in every bucket because NaN comparisons are False.
buckets = []
for i in range(len(thresholds) - 1):
    min_time, max_time = thresholds[i], thresholds[i + 1]
    in_bucket = df.DepTime.ge(min_time) & df.DepTime.lt(max_time)
    df["DepTimeBucket%s" % i] = in_bucket.astype(int)
In [122]:
# Names of every indicator column used as model input.
# The number of origin groups and departure-time buckets is derived from the
# objects that created those columns instead of being hard-coded, so changing
# the percentile cuts or the time thresholds cannot leave this list pointing at
# missing (or silently omitted) columns.
features = (
    ['OriginGroup%s' % i for i in range(len(origin_groups))]
    + ['Carrier%s' % carrier for carrier in unique_carriers]
    + ['DayOfWeek%s' % dow for dow in days_of_week]
    + ['DepTimeBucket%s' % i for i in range(len(thresholds) - 1)]
)
In [214]:
from sklearn.linear_model import LogisticRegression, LinearRegression
In [252]:
# Logistic-regression classifier with scikit-learn's default settings.
model = LogisticRegression()
In [253]:
# Modelling frame: the feature columns plus DepDelayMinutes, keeping only rows
# in which none of those columns is missing.
clean_df = df.loc[:, [*features, 'DepDelayMinutes']].dropna(axis=0, how='any')
In [264]:
# Binary target: 1 when the departure delay is 15 minutes or more, else 0.
# Computed with one vectorized comparison instead of a per-row Python loop.
clean_df['Delayed'] = (clean_df.DepDelayMinutes >= 15).astype(int)
In [265]:
# Number of leading rows used for training (70% of the modelling frame,
# rounded down); the remaining rows are held out.
# NOTE(review): the split is positional and unshuffled, so it inherits whatever
# order the CSV has -- confirm that is intended before trusting the metrics.
train_size = int(0.7 * len(clean_df))
In [266]:
# Fit on the first `train_size` rows; fit() returns the estimator, so the cell
# also displays it.
model.fit(
    X=clean_df[features].iloc[:train_size],
    y=clean_df['Delayed'].iloc[:train_size],
)
Out[266]:
In [267]:
# Hard 0/1 predictions for the held-out rows (everything after the training slice).
predictions = model.predict(clean_df.iloc[train_size:][features])
In [268]:
# True labels for the same held-out rows.
actuals = clean_df['Delayed'].iloc[train_size:]
In [299]:
# Display the sets of Origin codes that make up each origin group.
origin_groups
Out[299]:
In [ ]:
In [275]:
from sklearn.metrics import roc_curve, auc
In [276]:
# Predicted probability of the second class (column 1 of predict_proba) for
# each held-out row, as a list.
predict_probs = list(model.predict_proba(clean_df[features].iloc[train_size:])[:, 1])
In [277]:
# ROC curve for the held-out rows: false- and true-positive rates across score
# thresholds. This is a single binary curve (no micro-averaging), the returned
# thresholds are discarded, and the area is not computed in this cell.
fpr, tpr, _ = roc_curve(y_true=actuals, y_score=predict_probs)
In [278]:
import matplotlib.pyplot as plt
In [ ]:
In [279]:
# Plot the ROC curve of the held-out predictions against the chance diagonal.
# Both lines carry a label: plt.legend() only lists labelled artists, so
# without labels the legend is empty and matplotlib emits a warning.
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw,
         label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--', label='Chance')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])  # small headroom so the curve is not clipped at TPR = 1
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
In [292]:
In [280]:
# Area under the ROC curve defined by the fpr/tpr points computed above.
auc(fpr, tpr)
Out[280]:
In [281]:
# Fitted intercept term of the logistic regression.
model.intercept_
Out[281]:
In [282]:
# Map each feature name to its fitted coefficient (row 0 of coef_, in feature order).
coefs = {name: weight for name, weight in zip(features, model.coef_[0])}
In [ ]:
In [283]:
# Display the feature-name -> coefficient mapping.
coefs
Out[283]: