In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('unzipped_data/On_Time_On_Time_Performance_2016_8.csv')


/Users/kevin/anaconda/envs/calhacks/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (77,84) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [5]:
list(df.columns)


Out[5]:
['Year',
 'Quarter',
 'Month',
 'DayofMonth',
 'DayOfWeek',
 'FlightDate',
 'UniqueCarrier',
 'AirlineID',
 'Carrier',
 'TailNum',
 'FlightNum',
 'OriginAirportID',
 'OriginAirportSeqID',
 'OriginCityMarketID',
 'Origin',
 'OriginCityName',
 'OriginState',
 'OriginStateFips',
 'OriginStateName',
 'OriginWac',
 'DestAirportID',
 'DestAirportSeqID',
 'DestCityMarketID',
 'Dest',
 'DestCityName',
 'DestState',
 'DestStateFips',
 'DestStateName',
 'DestWac',
 'CRSDepTime',
 'DepTime',
 'DepDelay',
 'DepDelayMinutes',
 'DepDel15',
 'DepartureDelayGroups',
 'DepTimeBlk',
 'TaxiOut',
 'WheelsOff',
 'WheelsOn',
 'TaxiIn',
 'CRSArrTime',
 'ArrTime',
 'ArrDelay',
 'ArrDelayMinutes',
 'ArrDel15',
 'ArrivalDelayGroups',
 'ArrTimeBlk',
 'Cancelled',
 'CancellationCode',
 'Diverted',
 'CRSElapsedTime',
 'ActualElapsedTime',
 'AirTime',
 'Flights',
 'Distance',
 'DistanceGroup',
 'CarrierDelay',
 'WeatherDelay',
 'NASDelay',
 'SecurityDelay',
 'LateAircraftDelay',
 'FirstDepTime',
 'TotalAddGTime',
 'LongestAddGTime',
 'DivAirportLandings',
 'DivReachedDest',
 'DivActualElapsedTime',
 'DivArrDelay',
 'DivDistance',
 'Div1Airport',
 'Div1AirportID',
 'Div1AirportSeqID',
 'Div1WheelsOn',
 'Div1TotalGTime',
 'Div1LongestGTime',
 'Div1WheelsOff',
 'Div1TailNum',
 'Div2Airport',
 'Div2AirportID',
 'Div2AirportSeqID',
 'Div2WheelsOn',
 'Div2TotalGTime',
 'Div2LongestGTime',
 'Div2WheelsOff',
 'Div2TailNum',
 'Div3Airport',
 'Div3AirportID',
 'Div3AirportSeqID',
 'Div3WheelsOn',
 'Div3TotalGTime',
 'Div3LongestGTime',
 'Div3WheelsOff',
 'Div3TailNum',
 'Div4Airport',
 'Div4AirportID',
 'Div4AirportSeqID',
 'Div4WheelsOn',
 'Div4TotalGTime',
 'Div4LongestGTime',
 'Div4WheelsOff',
 'Div4TailNum',
 'Div5Airport',
 'Div5AirportID',
 'Div5AirportSeqID',
 'Div5WheelsOn',
 'Div5TotalGTime',
 'Div5LongestGTime',
 'Div5WheelsOff',
 'Div5TailNum',
 'Unnamed: 109']

What questions would you have about this data?


In [ ]:


In [8]:
df.shape


Out[8]:
(498347, 110)

In [6]:
len(set(df.Origin))


Out[6]:
305

In [19]:
df.FlightDate.min()


Out[19]:
'2016-08-01'

In [20]:
df.FlightDate.max()


Out[20]:
'2016-08-31'

In [21]:
df.DepTime.count()
df.DepTime.dropna().describe()


Out[21]:
count    491761.000000
mean       1335.955763
std         508.833662
min           1.000000
25%         916.000000
50%        1329.000000
75%        1748.000000
max        2400.000000
Name: DepTime, dtype: float64

In [11]:
needed_columns = ['Year',
 'Quarter',
 'Month',
 'DayofMonth',
 'DayOfWeek',
 'FlightDate',
 'UniqueCarrier',
 'Origin',
 'OriginCityName',
 'Dest',
 'DestCityName',
 'CRSDepTime',
 'DepTime',
 'DepDelay',
 'DepDelayMinutes',
 'DepDel15',
 'DepartureDelayGroups',
 'DepTimeBlk',
 'CRSArrTime',
 'ArrTime',
 'ArrDelay',
 'ArrDelayMinutes',
 'ArrDel15',
 'ArrTimeBlk',
 'Cancelled',
 'Diverted',
 'CRSElapsedTime',
 'ActualElapsedTime',
 'Distance',
 'DistanceGroup',
]

In [10]:
# Percentage of flights. 1=Monday, 2=Tuesday, etc.
df[['DayOfWeek', 'DepDel15']].groupby('DayOfWeek').mean()


Out[10]:
DepDel15
DayOfWeek
1 0.236520
2 0.192396
3 0.179082
4 0.213093
5 0.215063
6 0.206667
7 0.202173

In [195]:
df[['UniqueCarrier', 'DepDelayMinutes']].groupby('UniqueCarrier').count()


Out[195]:
DepDelayMinutes
UniqueCarrier
AA 78856
AS 16435
B6 24420
DL 82756
EV 41705
F9 8900
HA 7035
NK 11814
OO 54230
UA 49471
VX 6055
WN 110084

In [11]:
# Mean minutes of delay by carrier
df[['UniqueCarrier', 'DepDelayMinutes']].groupby('UniqueCarrier').mean()


Out[11]:
DepDelayMinutes
UniqueCarrier
AA 17.602427
AS 5.666322
B6 19.117322
DL 18.993922
EV 18.683779
F9 21.333820
HA 3.270505
NK 21.076350
OO 12.512152
UA 16.543146
VX 13.026755
WN 10.888540

In [ ]:
CARRIERS = {
    'AA': 'American',
    'AS': 'Alaska',
    'B6': 'Jet Blue',
    'DL': 'Delta',
    'EV': 'Express Jet',
    'F9': 'Frontier',
    'HA': 'Hawaiian',
    'NK': 'Spirit',
    'OO': 'SkyWest',
    'UA': 'United',
    'VX': 'Virgin',
    'WN': 'Southwest'
}

In [91]:
df[['Origin', 'DepDel15']].groupby('Origin').mean()


Out[91]:
DepDel15
Origin
ABE 0.239316
ABI 0.222222
ABQ 0.166667
ABR 0.016129
ABY 0.149425
ACK 0.197279
ACT 0.247706
ACV 0.345133
ACY 0.278107
ADK 0.250000
ADQ 0.068966
AEX 0.240157
AGS 0.229075
AKN 0.166667
ALB 0.167553
AMA 0.104869
ANC 0.107018
APN 0.172414
ASE 0.130000
ATL 0.214023
ATW 0.084337
AUS 0.187546
AVL 0.246914
AVP 0.172619
AZO 0.113772
BDL 0.157674
BET 0.024096
BFL 0.115385
BGM 0.148148
BGR 0.229508
... ...
SMF 0.169965
SMX 0.354839
SNA 0.170179
SPI 0.238095
SPS 0.176471
SRQ 0.236453
STL 0.200622
STT 0.167920
STX 0.200000
SUN 0.208861
SWF 0.245902
SYR 0.172676
TLH 0.145078
TPA 0.196301
TRI 0.167568
TTN 0.193548
TUL 0.131356
TUS 0.175549
TVC 0.213450
TWF 0.033898
TXK 0.160000
TYS 0.189274
UST 0.166667
VLD 0.181818
VPS 0.202128
WRG 0.196721
WYS 0.122807
XNA 0.133333
YAK 0.210526
YUM 0.107143

305 rows × 1 columns

How would you encode categorical data such a carrier, day of week and origin airport as numerical features?


In [ ]:


In [90]:
# Percent of flights arriving within 15 minute of time by origin
mean_dep_delay15 = df[['Origin', 'DepDel15']].groupby('Origin').mean()

In [17]:
len(set(df.Origin))


Out[17]:
305

In [23]:
for x in df.Origin:
    break

In [26]:
import numpy as np
  • Features:

    • Origin group
    • UniqueCarrier one hot encoding
    • Day of week one hot encoding
    • Time of day bucket
  • Objective function:

    • DepDelay15 (Whether or not the flight will be delayed by 15 minutes or more

Origin Groups


In [210]:
match = [x==y for (x,y) in zip((df.DepDelayMinutes >= 15), df.DepDel15)]

In [67]:
quantiles = [0] + list(np.percentile(mean_dep_delay15, [20,40,60,80])) + [1.1]

In [68]:
quantiles


Out[68]:
[0,
 0.12664473684210528,
 0.15826476743747647,
 0.18312887862606436,
 0.2180524418922776,
 1.1]

In [93]:
origin_groups = []
for (low, high) in list(zip(quantiles, quantiles[1:])):
    origin_groups.append(
        set(mean_dep_delay15[(mean_dep_delay15 >= low) & (mean_dep_delay15 < high)].dropna().index)
    )

In [94]:
[len(x) for x in origin_groups]


Out[94]:
[61, 61, 61, 61, 61]

In [76]:
for i, group in enumerate(origin_groups):
    df['OriginGroup%s' % i] = [int(o in group) for o in df.Origin]

Unique Carrier One-Hot Feature


In [87]:
unique_carriers = list(set(df.UniqueCarrier))
for carrier in unique_carriers:
    df['Carrier%s' % carrier]  = [int(x == carrier) for x in df.UniqueCarrier]

Day Of Week One-Hot Feature


In [97]:
days_of_week = sorted(list(set(df.DayOfWeek)))
for dow in days_of_week:
    df['DayOfWeek%s' % dow]  = [int(x == dow) for x in df.DayOfWeek]

In [ ]:

Time of Day Bucket Feature:


In [297]:
clean_df = df[['DepTime', 'DepDelayMinutes']].dropna()

plt.scatter(x=clean_df.DepTime.iloc[:5000], y = clean_df.DepDelayMinutes.iloc[:5000])


Out[297]:
<matplotlib.collections.PathCollection at 0x115117cf8>

In [99]:
thresholds = [-1, 400, 800, 1200, 1600, 2000, 2401]

In [110]:
buckets = []
for i, (min_time, max_time) in enumerate(list(zip(thresholds, thresholds[1:]))):
    df["DepTimeBucket%s" % i] = ((df.DepTime >= min_time) & (df.DepTime < max_time)).astype(int)

In [122]:
features = (['OriginGroup%s' % i for i in range(5)] +
            ['Carrier%s' % carrier for carrier in unique_carriers] +
            ['DayOfWeek%s' % dow for dow in days_of_week] + 
            ['DepTimeBucket%s' % i for i in range(6)]
           )

In [214]:
from sklearn.linear_model import LogisticRegression, LinearRegression

In [252]:
model = LogisticRegression()

In [253]:
clean_df = df[features + ['DepDelayMinutes']].dropna()

In [264]:
clean_df['Delayed'] = [int(x >= 15) for x in clean_df.DepDelayMinutes]

In [265]:
train_size = int(len(clean_df) * 0.7)

In [266]:
model.fit(clean_df[features].iloc[:train_size], clean_df.Delayed.iloc[:train_size])


Out[266]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [267]:
predictions = model.predict(clean_df[features].iloc[train_size:])

In [268]:
actuals = clean_df.Delayed.iloc[train_size:]

In [299]:
origin_groups


Out[299]:
[{'ABR',
  'ADQ',
  'AMA',
  'ANC',
  'ATW',
  'AZO',
  'BET',
  'BFL',
  'BIL',
  'BIS',
  'BRW',
  'BTM',
  'BZN',
  'CDC',
  'CIU',
  'CPR',
  'DLG',
  'DVL',
  'EFD',
  'EKO',
  'EWN',
  'FCA',
  'FNT',
  'FSD',
  'GCC',
  'GEG',
  'GFK',
  'GJT',
  'GTF',
  'HDN',
  'HLN',
  'HNL',
  'HRL',
  'HYS',
  'IDA',
  'INL',
  'ISN',
  'ITH',
  'ITO',
  'JMS',
  'JNU',
  'KOA',
  'KTN',
  'LCH',
  'LIH',
  'LSE',
  'LWS',
  'MOT',
  'MQT',
  'MSO',
  'OGG',
  'PIH',
  'PLN',
  'PPG',
  'RKS',
  'SCC',
  'SGU',
  'SIT',
  'TWF',
  'WYS',
  'YUM'},
 {'ABY',
  'ASE',
  'BDL',
  'BGM',
  'BMI',
  'BOI',
  'BQK',
  'BRD',
  'BRO',
  'BUF',
  'CHA',
  'COD',
  'CRP',
  'CWA',
  'DAY',
  'DSM',
  'ECP',
  'EGE',
  'ERI',
  'EUG',
  'EYW',
  'FAR',
  'FWA',
  'GNV',
  'GRB',
  'GRK',
  'GRR',
  'GST',
  'GUC',
  'ILM',
  'JAC',
  'LAN',
  'LAR',
  'LBB',
  'LNK',
  'MAF',
  'MCI',
  'MHT',
  'MKE',
  'MLB',
  'MRY',
  'MTJ',
  'OKC',
  'OMA',
  'OTZ',
  'PDX',
  'PHF',
  'PIT',
  'PSC',
  'PSE',
  'RHI',
  'ROW',
  'RST',
  'SBP',
  'SDF',
  'SGF',
  'SJC',
  'SLC',
  'TLH',
  'TUL',
  'XNA'},
 {'ABQ',
  'AKN',
  'ALB',
  'APN',
  'AVP',
  'BHM',
  'BJI',
  'BUR',
  'CAK',
  'CMH',
  'DCA',
  'DRO',
  'ELP',
  'FAI',
  'FAT',
  'FLG',
  'FSM',
  'GPT',
  'GSP',
  'HIB',
  'HSV',
  'IAD',
  'ICT',
  'IMT',
  'IND',
  'ISP',
  'JAN',
  'JAX',
  'LGB',
  'LIT',
  'MBS',
  'MEM',
  'MLU',
  'MSN',
  'MSY',
  'OAK',
  'OME',
  'ONT',
  'PAH',
  'PIA',
  'PNS',
  'PSG',
  'PVD',
  'PWM',
  'RAP',
  'RNO',
  'ROC',
  'RSW',
  'SAN',
  'SAT',
  'SEA',
  'SMF',
  'SNA',
  'SPS',
  'STT',
  'SYR',
  'TRI',
  'TUS',
  'TXK',
  'UST',
  'VLD'},
 {'ACK',
  'ATL',
  'AUS',
  'BNA',
  'BOS',
  'BTR',
  'BWI',
  'CAE',
  'CHS',
  'CID',
  'CLE',
  'CLL',
  'CMX',
  'COS',
  'CRW',
  'CSG',
  'CVG',
  'DAB',
  'DTW',
  'EVV',
  'FAY',
  'GCK',
  'GSO',
  'GTR',
  'HOB',
  'HOU',
  'HYA',
  'IAH',
  'LEX',
  'LFT',
  'LRD',
  'MDW',
  'MFE',
  'MFR',
  'MGM',
  'MLI',
  'MOB',
  'MSP',
  'OAJ',
  'ORF',
  'PBI',
  'PHL',
  'PHX',
  'PIB',
  'PSP',
  'RDU',
  'RIC',
  'ROA',
  'SBA',
  'SBN',
  'SHV',
  'STL',
  'STX',
  'SUN',
  'TPA',
  'TTN',
  'TVC',
  'TYS',
  'VPS',
  'WRG',
  'YAK'},
 {'ABE',
  'ABI',
  'ACT',
  'ACV',
  'ACY',
  'ADK',
  'AEX',
  'AGS',
  'AVL',
  'BGR',
  'BPT',
  'BQN',
  'BTV',
  'CDV',
  'CHO',
  'CLT',
  'DAL',
  'DEN',
  'DFW',
  'DHN',
  'DLH',
  'EAU',
  'ELM',
  'ESC',
  'EWR',
  'FLL',
  'GGG',
  'GRI',
  'GUM',
  'HPN',
  'IAG',
  'JFK',
  'JLN',
  'LAS',
  'LAW',
  'LAX',
  'LBE',
  'LGA',
  'MCO',
  'MDT',
  'MEI',
  'MIA',
  'MKG',
  'MVY',
  'MYR',
  'ORD',
  'ORH',
  'OTH',
  'PBG',
  'RDD',
  'RDM',
  'SAF',
  'SAV',
  'SCE',
  'SFO',
  'SJT',
  'SJU',
  'SMX',
  'SPI',
  'SRQ',
  'SWF'}]

In [ ]:


In [275]:
from sklearn.metrics import roc_curve, auc

In [276]:
predict_probs = [tpl[1] for tpl in model.predict_proba(clean_df[features].iloc[train_size:])]

In [277]:
# Compute micro-average ROC curve and ROC area
fpr, tpr, _ = roc_curve(actuals, predict_probs)

In [278]:
import matplotlib.pyplot as plt

In [ ]:


In [279]:
#Plot of a ROC curve for a specific class
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()


/Users/kevin/anaconda/envs/calhacks/lib/python3.5/site-packages/matplotlib/axes/_axes.py:531: UserWarning: No labelled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labelled objects found. "

In [292]:


In [280]:
auc(fpr, tpr)


Out[280]:
0.68482183879917058

In [281]:
model.intercept_


Out[281]:
array([-0.88227567])

In [282]:
coefs = dict(list(zip(features, model.coef_[0])))

In [ ]:


In [283]:
coefs


Out[283]:
{'CarrierAA': -0.0019204985425103213,
 'CarrierAS': -0.84841944514035605,
 'CarrierB6': 0.12241821143901417,
 'CarrierDL': -0.13261989508615579,
 'CarrierEV': -0.010973177444743456,
 'CarrierF9': 0.0,
 'CarrierHA': 0.0,
 'CarrierNK': 0.0,
 'CarrierOO': -0.023123225427465505,
 'CarrierUA': 0.16964701242785432,
 'CarrierVX': -0.053647076481738616,
 'CarrierWN': -0.10363757129666461,
 'DayOfWeek1': 0.063727159827779406,
 'DayOfWeek2': -0.1853191855446592,
 'DayOfWeek3': -0.30014428231028273,
 'DayOfWeek4': -0.094916176670324648,
 'DayOfWeek5': -0.062572400878981818,
 'DayOfWeek6': -0.084710367475524101,
 'DayOfWeek7': -0.21834041248771782,
 'DepTimeBucket0': 1.3627774299145421,
 'DepTimeBucket1': -1.9928496577654837,
 'DepTimeBucket2': -0.82127521506520362,
 'DepTimeBucket3': -0.27652436329447116,
 'DepTimeBucket4': 0.17856868964922734,
 'DepTimeBucket5': 0.66702745101136107,
 'OriginGroup0': -0.67761214167229855,
 'OriginGroup1': -0.25382448266698804,
 'OriginGroup2': -0.11041084237121421,
 'OriginGroup3': -0.067072611283875219,
 'OriginGroup4': 0.22664441244453568}