In [1]:
import pandas as pd
%matplotlib inline

In [6]:
import matplotlib.pyplot as plt # package for doing plotting (necessary for adding the line)
from sklearn.linear_model import LinearRegression # package for doing linear regression (there are others)
import numpy as np # necessary for working with the data and getting it properly shaped

In [2]:
# Read the on-time report CSV into a DataFrame (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='data/ontime_reports_may_2015_ny.csv')

In [3]:
# Correlation of every numeric column with DEP_DELAY, sorted ascending (NaN last).
# Series.order() has been removed from pandas; sort_values() is its replacement.
# Numeric columns are selected explicitly because newer pandas no longer
# silently drops non-numeric columns inside DataFrame.corr().
df.select_dtypes(include='number').corr()['DEP_DELAY'].sort_values()


Out[3]:
FL_NUM                -0.010442
SECURITY_DELAY        -0.006269
DISTANCE              -0.001030
AIR_TIME               0.002656
ACTUAL_ELAPSED_TIME    0.017203
CANCELLED              0.024350
DIVERTED               0.039431
DAY_OF_MONTH           0.096697
WEATHER_DELAY          0.204236
NAS_DELAY              0.300172
CARRIER_DELAY          0.530374
LATE_AIRCRAFT_DELAY    0.534286
ARR_DELAY              0.945356
DEP_DELAY              1.000000
YEAR                        NaN
MONTH                       NaN
Unnamed: 20                 NaN
Name: DEP_DELAY, dtype: float64

In [4]:
# scatter_matrix lives in pandas.plotting on current pandas; the old
# pandas.tools.plotting module has been removed. Fall back to the legacy
# location only when the new one is unavailable (very old pandas).
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

In [5]:
# Pairwise scatter plots of the delay-related columns, with a kernel density
# estimate on the diagonal. Low alpha keeps overlapping points readable.
delay_columns = [
    u'DEP_DELAY',
    u'ARR_DELAY',
    u'ACTUAL_ELAPSED_TIME',
    u'CARRIER_DELAY',
    u'WEATHER_DELAY',
    u'NAS_DELAY',
    u'SECURITY_DELAY',
    u'LATE_AIRCRAFT_DELAY',
]
scatter_matrix(df[delay_columns], alpha=0.2, figsize=(20, 20), diagonal='kde')


Out[5]:
array([[<matplotlib.axes.AxesSubplot object at 0x1071d9610>,
        <matplotlib.axes.AxesSubplot object at 0x1076c9250>,
        <matplotlib.axes.AxesSubplot object at 0x1077bd2d0>,
        <matplotlib.axes.AxesSubplot object at 0x107c63c90>,
        <matplotlib.axes.AxesSubplot object at 0x107e85b50>,
        <matplotlib.axes.AxesSubplot object at 0x107b9ea10>,
        <matplotlib.axes.AxesSubplot object at 0x1077667d0>,
        <matplotlib.axes.AxesSubplot object at 0x1038634d0>],
       [<matplotlib.axes.AxesSubplot object at 0x10738e390>,
        <matplotlib.axes.AxesSubplot object at 0x107506410>,
        <matplotlib.axes.AxesSubplot object at 0x1075b6ed0>,
        <matplotlib.axes.AxesSubplot object at 0x10772ac90>,
        <matplotlib.axes.AxesSubplot object at 0x107b21f90>,
        <matplotlib.axes.AxesSubplot object at 0x107cc6e50>,
        <matplotlib.axes.AxesSubplot object at 0x107d23990>,
        <matplotlib.axes.AxesSubplot object at 0x107db1c50>],
       [<matplotlib.axes.AxesSubplot object at 0x108d6cdd0>,
        <matplotlib.axes.AxesSubplot object at 0x109059650>,
        <matplotlib.axes.AxesSubplot object at 0x1090dd410>,
        <matplotlib.axes.AxesSubplot object at 0x109244490>,
        <matplotlib.axes.AxesSubplot object at 0x1092d3410>,
        <matplotlib.axes.AxesSubplot object at 0x109323e50>,
        <matplotlib.axes.AxesSubplot object at 0x1093c3150>,
        <matplotlib.axes.AxesSubplot object at 0x1093f92d0>],
       [<matplotlib.axes.AxesSubplot object at 0x1094aab10>,
        <matplotlib.axes.AxesSubplot object at 0x1096318d0>,
        <matplotlib.axes.AxesSubplot object at 0x1096845d0>,
        <matplotlib.axes.AxesSubplot object at 0x109715550>,
        <matplotlib.axes.AxesSubplot object at 0x1096c9410>,
        <matplotlib.axes.AxesSubplot object at 0x1097ff350>,
        <matplotlib.axes.AxesSubplot object at 0x1098872d0>,
        <matplotlib.axes.AxesSubplot object at 0x1098ebad0>],
       [<matplotlib.axes.AxesSubplot object at 0x109a72890>,
        <matplotlib.axes.AxesSubplot object at 0x109ad7f90>,
        <matplotlib.axes.AxesSubplot object at 0x109b68b10>,
        <matplotlib.axes.AxesSubplot object at 0x109b1eb10>,
        <matplotlib.axes.AxesSubplot object at 0x109d53d10>,
        <matplotlib.axes.AxesSubplot object at 0x109ddabd0>,
        <matplotlib.axes.AxesSubplot object at 0x109e4c4d0>,
        <matplotlib.axes.AxesSubplot object at 0x109ed3290>],
       [<matplotlib.axes.AxesSubplot object at 0x10a038990>,
        <matplotlib.axes.AxesSubplot object at 0x10a0c9510>,
        <matplotlib.axes.AxesSubplot object at 0x10a07d510>,
        <matplotlib.axes.AxesSubplot object at 0x10a1b5710>,
        <matplotlib.axes.AxesSubplot object at 0x10a23b5d0>,
        <matplotlib.axes.AxesSubplot object at 0x10a2a0e90>,
        <matplotlib.axes.AxesSubplot object at 0x10a325c50>,
        <matplotlib.axes.AxesSubplot object at 0x10a39b390>],
       [<matplotlib.axes.AxesSubplot object at 0x10a51ced0>,
        <matplotlib.axes.AxesSubplot object at 0x10a3d3ed0>,
        <matplotlib.axes.AxesSubplot object at 0x10a616110>,
        <matplotlib.axes.AxesSubplot object at 0x10a690f90>,
        <matplotlib.axes.AxesSubplot object at 0x10a6e35d0>,
        <matplotlib.axes.AxesSubplot object at 0x10b0681d0>,
        <matplotlib.axes.AxesSubplot object at 0x10b0c4f90>,
        <matplotlib.axes.AxesSubplot object at 0x10b15ec90>],
       [<matplotlib.axes.AxesSubplot object at 0x10b199e10>,
        <matplotlib.axes.AxesSubplot object at 0x10b24bd50>,
        <matplotlib.axes.AxesSubplot object at 0x10b2d1b10>,
        <matplotlib.axes.AxesSubplot object at 0x10b538a90>,
        <matplotlib.axes.AxesSubplot object at 0x10b5c9a10>,
        <matplotlib.axes.AxesSubplot object at 0x10b723490>,
        <matplotlib.axes.AxesSubplot object at 0x10b7b5750>,
        <matplotlib.axes.AxesSubplot object at 0x10b7f08d0>]], dtype=object)

In [7]:
# Create an ordinary least-squares regressor with default settings
# (the repr printed after fitting shows fit_intercept=True, normalize=False).
lr = LinearRegression()

In [11]:
# Build the regression inputs: DEP_DELAY is the target, the three delay-cause
# columns are the features. Missing values are replaced with 0 before the
# frame is converted to a NumPy array.
# NOTE(review): fillna(0) is applied to the DEP_DELAY target as well as the
# features, so rows with a missing departure delay are treated as 0 -- confirm
# that this is intended rather than dropping those rows.
target_col = 'DEP_DELAY'
feature_cols = ['LATE_AIRCRAFT_DELAY', 'CARRIER_DELAY', 'NAS_DELAY']
model_frame = df[[target_col] + feature_cols].fillna(0)
data = np.asarray(model_frame)
# Slice (not reshape): column 0 is the target, the remaining columns are the features.
y = data[:, 0]
x = data[:, 1:]

In [12]:
# Fit the linear model: x holds the feature columns, y the DEP_DELAY target.
lr.fit(x,y)


Out[12]:
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

In [13]:
# Fitted slope coefficients, one per feature column of x
# (column order: LATE_AIRCRAFT_DELAY, CARRIER_DELAY, NAS_DELAY).
m = lr.coef_

In [14]:
# Display the coefficient array.
m


Out[14]:
array([ 1.07649627,  1.03713069,  0.84402415])

In [15]:
# Fitted intercept term of the linear model.
b = lr.intercept_

In [16]:
# Display the intercept.
b


Out[16]:
-1.0086983381902872

In [18]:
# Coefficient of determination (R^2) of the fitted model.
# NOTE(review): this is evaluated on the same x, y used for fitting, so it is
# an in-sample score; there is no held-out data in this notebook.
lr.score(x,y)


Out[18]:
0.87595697846760023

In [ ]: