In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): %pylab also populates the interactive namespace from numpy and
# matplotlib (see the message it prints). The cells in this notebook only use
# the explicit np / plt / sns / pd names, so this line looks removable - confirm.
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing

import seaborn as sns

import pandas as pd

In [4]:
# Settings shared by every figure saved from this notebook.
DPI = 120                     # resolution handed to plt.savefig(dpi=...)
IMG_DIR = '../../analysis'    # directory prefix for the saved PNG files

Generating rows to skip


In [5]:
s = 10000  # desired sample size
n = 5967780  # line numbers 1..n are the candidates; line 0 is never skipped, so the header row is kept
SEED = 42  # fixed seed: the same sample is drawn on every Restart & Run All

# Draw the (n - s) line numbers to skip, without replacement, which leaves s
# lines to be read. Sorting inside numpy and converting once with tolist()
# avoids running Python's sorted() over millions of numpy scalars.
rng = np.random.RandomState(SEED)
%time rows_to_skip = np.sort(rng.choice(np.arange(1, n + 1), (n - s), replace=False)).tolist()
len(rows_to_skip)


CPU times: user 8.66 s, sys: 281 ms, total: 8.94 s
Wall time: 8.94 s
Out[5]:
5957780

Loading data


In [6]:
# Load the CSV while dropping every line number listed in rows_to_skip, so
# only the sampled lines end up in the frame.
csv_path = '../../data/2001.csv'
read_opts = dict(encoding='iso-8859-1', engine='c', skiprows=rows_to_skip)
%time df = pd.read_csv(csv_path, **read_opts)
len(df)


CPU times: user 5.62 s, sys: 469 ms, total: 6.09 s
Wall time: 6.17 s
Out[6]:
10000

In [7]:
# Peek at the first five rows of the freshly loaded sample.
df.head(5)


Out[7]:
Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime UniqueCarrier FlightNum ... TaxiIn TaxiOut Cancelled CancellationCode Diverted CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay
0 2001 1 27 6 1812.0 1810 1928.0 1934 US 375 ... 4 10 0 NaN 0 NaN NaN NaN NaN NaN
1 2001 1 6 6 710.0 715 946.0 954 US 428 ... 4 14 0 NaN 0 NaN NaN NaN NaN NaN
2 2001 1 17 3 1822.0 1829 2022.0 2028 US 435 ... 6 28 0 NaN 0 NaN NaN NaN NaN NaN
3 2001 1 25 4 1858.0 1850 2035.0 2031 US 455 ... 8 12 0 NaN 0 NaN NaN NaN NaN NaN
4 2001 1 29 1 1201.0 1140 1328.0 1303 US 457 ... 8 18 0 NaN 0 NaN NaN NaN NaN NaN

5 rows × 29 columns

Create numeric versions of the categorical columns for later analysis


In [8]:
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# Turn the text labels into integer codes, stored next to the originals in a
# new column named '<col>_'.
text_cols = ['UniqueCarrier', 'Origin', 'Dest']
le = preprocessing.LabelEncoder()
for c in text_cols:
    # Assign the encoded array back to the frame explicitly instead of calling
    # an in-place method on df[col]: an in-place call on a selected column
    # does not write through to df when pandas Copy-on-Write is active, which
    # would leave the '<col>_' columns holding the original text.
    le.fit(df[c])
    df[c + '_'] = le.transform(df[c])

In [9]:
# First five rows again, now including the added '<col>_' code columns.
df.head(5)


Out[9]:
Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime UniqueCarrier FlightNum ... CancellationCode Diverted CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay UniqueCarrier_ Origin_ Dest_
0 2001 1 27 6 1812.0 1810 1928.0 1934 US 375 ... NaN 0 NaN NaN NaN NaN NaN 10 33 38
1 2001 1 6 6 710.0 715 946.0 954 US 428 ... NaN 0 NaN NaN NaN NaN NaN 10 196 38
2 2001 1 17 3 1822.0 1829 2022.0 2028 US 435 ... NaN 0 NaN NaN NaN NaN NaN 10 26 158
3 2001 1 25 4 1858.0 1850 2035.0 2031 US 455 ... NaN 0 NaN NaN NaN NaN NaN 10 43 94
4 2001 1 29 1 1201.0 1140 1328.0 1303 US 457 ... NaN 0 NaN NaN NaN NaN NaN 10 161 180

5 rows × 32 columns

Replace NaN with -1 (we have plenty of them)


In [10]:
# Use -1 as the missing-value marker in every column of the frame.
df = df.fillna(-1)

In [11]:
# First five rows after the NaN -> -1 fill.
df.head(5)


Out[11]:
Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime UniqueCarrier FlightNum ... CancellationCode Diverted CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay UniqueCarrier_ Origin_ Dest_
0 2001 1 27 6 1812.0 1810 1928.0 1934 US 375 ... -1.0 0 -1.0 -1.0 -1.0 -1.0 -1.0 10 33 38
1 2001 1 6 6 710.0 715 946.0 954 US 428 ... -1.0 0 -1.0 -1.0 -1.0 -1.0 -1.0 10 196 38
2 2001 1 17 3 1822.0 1829 2022.0 2028 US 435 ... -1.0 0 -1.0 -1.0 -1.0 -1.0 -1.0 10 26 158
3 2001 1 25 4 1858.0 1850 2035.0 2031 US 455 ... -1.0 0 -1.0 -1.0 -1.0 -1.0 -1.0 10 43 94
4 2001 1 29 1 1201.0 1140 1328.0 1303 US 457 ... -1.0 0 -1.0 -1.0 -1.0 -1.0 -1.0 10 161 180

5 rows × 32 columns


In [12]:
# Columns fed into the correlation matrix: columns taken straight from the
# CSV, followed by the integer-coded '<col>_' columns.
source_cols = ['DayOfWeek', 'DepTime', 'ArrTime', 'ArrDelay', 'Distance']
coded_cols = ['UniqueCarrier_', 'Origin_', 'Dest_']
cols_for_correlation = source_cols + coded_cols

In [13]:
# Correlation matrix of the selected columns, drawn as an annotated heatmap
# on a 10x10 inch figure and written to IMG_DIR as corr.png.
corrmat = df[cols_for_correlation].corr()
heatmap_ax = sns.heatmap(corrmat, annot=True)
heatmap_ax.figure.set_size_inches(10, 10)
plt.savefig(IMG_DIR+'/corr.png', dpi = DPI)



In [14]:
def plot(col1, col2):
    """Draw a hexbin joint plot of two columns of ``df`` and save it as a PNG.

    The figure is resized to 10x10 inches and written to
    ``<IMG_DIR>/<col1>_<col2>.png`` at ``DPI`` dots per inch.

    Parameters
    ----------
    col1 : column label
        Column of the notebook-level ``df`` plotted on the x axis; also the
        first part of the output file name.
    col2 : column label
        Column of the notebook-level ``df`` plotted on the y axis; also the
        second part of the output file name.
    """
    # https://stanford.edu/~mwaskom/software/seaborn/generated/seaborn.jointplot.html#seaborn.jointplot
    # x and y are passed by keyword: recent seaborn releases make them
    # keyword-only, and older releases accept the keyword form as well.
    sns.jointplot(x=df[col1], y=df[col2], dropna=True, kind="hex")
    figure = plt.gcf()
    figure.set_size_inches(10, 10)
    plt.savefig('%s/%s_%s.png'%(IMG_DIR, col1, col2), dpi = DPI)

In [15]:
# Arrival time against departure time.
plot(col1='ArrTime', col2='DepTime')



In [16]:
# Flight distance against the integer-coded carrier.
plot(col1='Distance', col2='UniqueCarrier_')



In [17]:
# Integer-coded origin against the integer-coded carrier.
plot(col1='Origin_', col2='UniqueCarrier_')



In [ ]:


In [ ]: