In [1]:
%matplotlib inline
%pylab inline
In [2]:
import warnings
warnings.filterwarnings('ignore')
In [3]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
import pandas as pd
import dask.dataframe as dd
In [4]:
# Output settings shared by every figure this notebook writes to disk.
IMG_DIR = '../../analysis'  # directory the savefig calls write PNG files into
DPI = 120  # resolution (dots per inch) handed to savefig
In [5]:
# http://dask.pydata.org/en/latest/dataframe-overview.html
# Open the raw 2001 CSV as a dask dataframe. The file is read as ISO-8859-1
# (Latin-1) text; %time reports how long setting up the dataframe takes.
%time lazy_df = dd.read_csv('../../data/raw/2001.csv', encoding='iso-8859-1')
In [6]:
# Count (and time) the rows of the full dataset.
# NOTE(review): the hard-coded `n` used for sampling presumably mirrors this count — confirm.
%time len(lazy_df)
Out[6]:
In [31]:
# http://dask.pydata.org/en/latest/dataframe-api.html#dask.dataframe.DataFrame.sample
# dask samples by fraction rather than by row count, so the desired number of
# rows is converted into a fraction of the whole dataset first.
s = 10000 # desired sample size
n = 5967780 # total row count (NOTE(review): presumably the len(lazy_df) result — confirm)
fraction = s / n
# The fraction is passed by keyword: the first positional parameter of
# DataFrame.sample is `n` in current dask, which dask does not support.
# A fixed random_state makes a fresh Restart & Run All draw the same rows.
df = lazy_df.sample(frac=fraction, random_state=42)
In [8]:
# Count (and time) the rows that the fractional sample actually contains.
%time len(df)
Out[8]:
In [9]:
df.head()
Out[9]:
In [10]:
# first turn our 10000 samples into a normal pandas df for convenience
# NOTE: this rebinds `df` from the lazy dask sample to the computed result, so
# the cell is meant to run exactly once per kernel session.
%time df = df.compute()
In [11]:
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# Turn the text labels into numerical codes. For every text column a sibling
# column with a trailing underscore (e.g. 'Origin_') receives the integer code
# of each label; the original text column is left untouched.
text_cols = ['UniqueCarrier', 'Origin', 'Dest']
le = preprocessing.LabelEncoder()
for c in text_cols:
    # fit_transform learns the sorted set of labels and encodes the column in
    # one pass. Assigning the result directly avoids a chained
    # `df[col].replace(..., inplace=True)`, which operates on a temporary
    # Series and is not guaranteed to write back into `df`.
    df[c + '_'] = le.fit_transform(df[c])
In [12]:
df.head()
Out[12]:
In [13]:
# Substitute the sentinel -1 for every missing value in the frame.
# NOTE(review): -1 may also be a legitimate value in columns such as ArrDelay,
# in which case the sentinel cannot be told apart from real data — confirm.
df = df.fillna(-1)
In [14]:
df.head()
Out[14]:
In [15]:
# Columns fed into the correlation matrix: the numeric flight attributes
# followed by the label-encoded (underscore-suffixed) text columns.
cols_for_correlation = (
    ['DayOfWeek', 'DepTime', 'ArrTime', 'ArrDelay', 'Distance']
    + ['UniqueCarrier_', 'Origin_', 'Dest_']
)
In [16]:
# Pairwise correlation of the selected columns, drawn as an annotated heatmap
# and saved to IMG_DIR/corr.png.
corrmat = df[cols_for_correlation].corr()
sns.heatmap(corrmat, annot=True)
# Keep the handle in `fig` rather than `figure`: this is notebook-global scope,
# and `figure` is one of the names `%pylab inline` pulls in from pylab, so
# rebinding it would shadow that function for every later cell.
fig = plt.gcf()
fig.set_size_inches(10, 10)
fig.savefig(IMG_DIR+'/corr.png', dpi = DPI)
In [17]:
def plot(col1, col2):
    """Draw a hexbin joint plot of two columns of ``df`` and save it as a PNG.

    Parameters
    ----------
    col1 : str
        Name of the ``df`` column plotted on the x axis.
    col2 : str
        Name of the ``df`` column plotted on the y axis.

    Returns
    -------
    None
        The figure is resized to 10x10 inches and written to
        ``<IMG_DIR>/<col1>_<col2>.png`` at ``DPI`` dots per inch.
    """
    # https://stanford.edu/~mwaskom/software/seaborn/generated/seaborn.jointplot.html#seaborn.jointplot
    # x and y are passed by keyword: newer seaborn releases accept them only as
    # keywords, and the keyword form is valid on older releases as well.
    sns.jointplot(x=df[col1], y=df[col2], dropna=True, kind="hex")
    fig = plt.gcf()
    fig.set_size_inches(10, 10)
    plt.savefig('%s/%s_%s.png'%(IMG_DIR, col1, col2), dpi = DPI)
In [18]:
plot('ArrTime', 'DepTime')
In [19]:
plot('Distance', 'UniqueCarrier_')
In [20]:
plot('Origin_', 'UniqueCarrier_')
In [21]:
# 2400 is not a valid time
# Map it to 2359 (the last valid HHMM value) in one vectorised pass instead of
# evaluating a Python lambda for every row of the frame.
df['CRSDepTime'] = df['CRSDepTime'].replace(2400, 2359)
In [22]:
# Build the scheduled-departure timestamp from the date columns and the HHMM
# integer in CRSDepTime (e.g. 930 -> 09:30). The components are assembled
# numerically by pd.to_datetime rather than formatting and parsing one string
# per row.
crs_dep_hhmm = df['CRSDepTime'].astype(int)
df['@timestamp'] = pd.to_datetime(
    df[['Year', 'Month', 'DayofMonth']]
    .rename(columns={'Year': 'year', 'Month': 'month', 'DayofMonth': 'day'})
    .assign(hour=crs_dep_hhmm // 100, minute=crs_dep_hhmm % 100)
)
In [28]:
df.head()
Out[28]:
In [23]:
# Shorthand for the timestamp column used by the histograms below.
timestamps = df['@timestamp']
In [24]:
# NOTE(review): interactive help lookup (IPython `?` syntax) left over from
# development; it does not contribute to the analysis and can be deleted.
plt.hist?
In [25]:
# Distribution of the sampled flights over time, drawn as an outline histogram.
# 365 bins — presumably one per day of the year; confirm against the date range.
plt.hist(timestamps.tolist(), bins=365, histtype = 'step', color='black')
plt.show()
In [26]:
# Rough number of sampled flights per bin of a 365-bin histogram
# (the desired sample size of 10000 divided by the bin count).
10000 / 365
Out[26]:
In [27]:
# The timestamps again at a coarser resolution, as filled bars.
# 12 bins — presumably one per month; confirm against the date range.
plt.hist(timestamps.tolist(), bins=12, histtype = 'bar')
plt.show()
In [29]:
# Convert the numeric flag columns to booleans: 0 becomes False, any other
# value becomes True. The comparison is vectorised over the whole column
# instead of calling a Python lambda once per row.
df['Cancelled'] = df['Cancelled'] != 0
df['Diverted'] = df['Diverted'] != 0
In [30]:
df.head()
Out[30]:
In [ ]: