In [13]:
import pandas as pd
%matplotlib inline

In [14]:
import matplotlib.pyplot as plt # package for doing plotting (necessary for adding the line)
import statsmodels.formula.api as smf

In [15]:
# Read the on-time report CSV (path is relative to the notebook's working
# directory) into the DataFrame used by every later cell.
ontime_csv_path = "data/ontime_reports_may_2015_ny.csv"
df = pd.read_csv(ontime_csv_path)

In [16]:
# Preview the first five rows to sanity-check how the columns were parsed.
df.head(n=5)


Out[16]:
YEAR MONTH DAY_OF_MONTH CARRIER FL_NUM ORIGIN DEST DEP_DELAY ARR_DELAY CANCELLED ... DIVERTED ACTUAL_ELAPSED_TIME AIR_TIME DISTANCE CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY Unnamed: 20
0 2015 5 1 AA 44 LAS JFK -6.0 -17.0 0.0 ... 0.0 294.0 273.0 2248.0 NaN NaN NaN NaN NaN NaN
1 2015 5 2 AA 44 LAS JFK -8.0 -14.0 0.0 ... 0.0 299.0 280.0 2248.0 NaN NaN NaN NaN NaN NaN
2 2015 5 3 AA 44 LAS JFK 0.0 -11.0 0.0 ... 0.0 294.0 274.0 2248.0 NaN NaN NaN NaN NaN NaN
3 2015 5 4 AA 44 LAS JFK -11.0 4.0 0.0 ... 0.0 320.0 275.0 2248.0 NaN NaN NaN NaN NaN NaN
4 2015 5 5 AA 44 LAS JFK -4.0 -18.0 0.0 ... 0.0 291.0 270.0 2248.0 NaN NaN NaN NaN NaN NaN

5 rows × 21 columns


In [18]:
# Rank every numeric column by its correlation with ARR_DELAY, strongest
# positive correlation first.
# Non-numeric columns (CARRIER, ORIGIN, DEST hold strings) are dropped
# explicitly: pandas >= 2.0 no longer discards them silently inside
# DataFrame.corr() and raises on string data instead. Selecting numeric
# columns up front behaves the same on older and newer pandas.
numeric_df = df.select_dtypes(include="number")
numeric_df.corr()["ARR_DELAY"].sort_values(ascending=False)


Out[18]:
ARR_DELAY              1.000000
DEP_DELAY              0.945356
CARRIER_DELAY          0.508489
LATE_AIRCRAFT_DELAY    0.499341
NAS_DELAY              0.424661
WEATHER_DELAY          0.219410
DAY_OF_MONTH           0.104742
ACTUAL_ELAPSED_TIME    0.029697
FL_NUM                 0.018313
SECURITY_DELAY        -0.011483
AIR_TIME              -0.014512
DISTANCE              -0.036939
YEAR                        NaN
MONTH                       NaN
CANCELLED                   NaN
DIVERTED                    NaN
Unnamed: 20                 NaN
Name: ARR_DELAY, dtype: float64

In [19]:
# scatter_matrix lives in the public pandas.plotting namespace; the old
# pandas.tools.plotting module was deprecated in pandas 0.20 and later removed,
# so importing from it fails on current pandas.
from pandas.plotting import scatter_matrix

In [ ]:
# Pairwise scatter plots of the delay-related columns; each diagonal panel
# shows a kernel density estimate instead of a histogram.
delay_columns = [
    u'DEP_DELAY', u'ARR_DELAY', u'ACTUAL_ELAPSED_TIME',
    u'CARRIER_DELAY', u'WEATHER_DELAY', u'NAS_DELAY', u'SECURITY_DELAY',
    u'LATE_AIRCRAFT_DELAY',
]
scatter_matrix(df[delay_columns], alpha=0.2, figsize=(20, 20), diagonal='kde')

In [ ]:
# Fit an ordinary least squares regression of DEP_DELAY on three of the
# delay-cause columns, using the statsmodels formula interface.
delay_formula = "DEP_DELAY ~ LATE_AIRCRAFT_DELAY + CARRIER_DELAY + NAS_DELAY"
delay_model = smf.ols(formula=delay_formula, data=df)
lm = delay_model.fit()

In [ ]:
# Display the estimated parameters of the fitted OLS model.
lm.params

In [ ]: