In [1]:
# Render matplotlib figures inline in the notebook output area.
%matplotlib inline
In [2]:
from time import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
import pandas as pd
# --- Load a random sample of rows from the 2001 flights CSV -----------------
print('Generating rows to skip')
s = 10000  # desired sample size
n = 5967780  # hard-coded row count of the CSV
SEED = 0  # fixed seed so the same rows are sampled on every run
path = '../../data/2001/2001.csv'
# Row 0 (the header) is never a candidate; n - s of the rows 1..n are skipped.
# NOTE(review): if n is the raw line count *including* the header, index n does
# not exist and the sample can come out one row short of s -- confirm.
# np.sort(...).tolist() yields the same sorted list as sorted(...), without
# iterating millions of numpy scalars in Python.
rows_to_skip = np.sort(
    np.random.RandomState(SEED).choice(np.arange(1, n + 1), n - s, replace=False)
).tolist()
print('Rows to skip: ', len(rows_to_skip))
print('Loading data')
# http://pandas.pydata.org/pandas-docs/stable/io.html#date-handling
df = pd.read_csv(path,
                 encoding='iso-8859-1', engine='c',
                 skiprows=rows_to_skip
                 )
# Combine Year/Month/DayofMonth into one datetime column. This used to be done
# with parse_dates=[[...]] inside read_csv, which pandas deprecated in 2.2; the
# result keeps the same name and position (first column), and the three source
# columns are dropped just as before.
date_parts = ['Year', 'Month', 'DayofMonth']
flight_dates = pd.to_datetime(
    df[date_parts].rename(
        columns={'Year': 'year', 'Month': 'month', 'DayofMonth': 'day'}
    )
)
df = df.drop(date_parts, axis=1)
df.insert(0, 'Year_Month_DayofMonth', flight_dates)
print('Data loaded')
In [3]:
# Shell out to count the lines of the raw CSV (every line, including any header).
# Presumably the source of the hard-coded row count used when sampling -- confirm.
!wc -l ../../data/2001/2001.csv
In [4]:
# Non-null value count per column: shows which fields have missing data in the sample.
df.count()
Out[4]:
In [5]:
# Use the combined flight date as the index so rows can be selected by date string.
# pop() removes the column from df, so an unguarded second run of this cell would
# raise KeyError; the guard makes the cell safe to re-run.
if 'Year_Month_DayofMonth' in df.columns:
    df.index = pd.to_datetime(df.pop('Year_Month_DayofMonth'))
In [6]:
# Preview the first five rows to check the date index and the remaining columns.
df.head()
Out[6]:
In [7]:
# Mean of the Cancelled column over the sampled flights dated 2001-09-10
# (the cancellation rate, assuming Cancelled is a 0/1 flag -- confirm).
df['Cancelled'].loc['2001-09-10'].mean()
Out[7]:
In [8]:
# Mean of the Cancelled column over the sampled flights dated 2001-09-11
# (the cancellation rate, assuming Cancelled is a 0/1 flag -- confirm).
df['Cancelled'].loc['2001-09-11'].mean()
Out[8]:
In [9]:
# Mean of the Cancelled column over the sampled flights dated 2001-09-12
# (the cancellation rate, assuming Cancelled is a 0/1 flag -- confirm).
df['Cancelled'].loc['2001-09-12'].mean()
Out[9]:
In [10]:
# Mean of the Cancelled column over the sampled flights dated 2001-09-13
# (the cancellation rate, assuming Cancelled is a 0/1 flag -- confirm).
df['Cancelled'].loc['2001-09-13'].mean()
Out[10]:
In [11]:
# Smooth the Cancelled column with a 30-observation moving average.
# An integer window counts rows (sampled flights), not calendar days.
rolling = df['Cancelled'].rolling(30, center=False).mean()
# Windows with fewer than 30 valid observations (at least the first 29 rows)
# come back as NaN; keep only the fully populated ones for plotting.
complete = rolling[rolling.notnull()]
complete.plot()
Out[11]:
In [12]:
# Average departure and arrival delay per carrier.
grouped = df.groupby(df.UniqueCarrier)
# Select the two delay columns *before* aggregating: averaging every column
# first is wasted work and raises TypeError on the text columns in
# pandas >= 2.0, where mean() no longer silently drops non-numeric columns.
means = grouped[['DepDelay', 'ArrDelay']].mean()
means
Out[12]:
In [13]:
# http://pandas.pydata.org/pandas-docs/stable/visualization.html
# `%pylab inline` was dropped here: inline rendering is already enabled at the
# top of the notebook, and %pylab star-imports numpy/pylab into the namespace
# (including a `plot` function that clashes with the helper defined below).
import matplotlib
# Grouped bar chart: one pair of bars (DepDelay, ArrDelay) per carrier.
means.plot.bar()
Out[13]:
In [14]:
# --- Integer-encode the text columns and plot a correlation heatmap ---------
text_cols = [u'UniqueCarrier', u'Origin', u'Dest']
le = preprocessing.LabelEncoder()
for c in text_cols:
    # fit_transform assigns each distinct value its rank in sorted order, the
    # same codes the previous fit / transform / replace sequence produced.
    # Assigning the result as a new column avoids the chained
    # `df[col].replace(..., inplace=True)`, which acts on a temporary selection
    # and is a no-op under pandas copy-on-write.
    df[c + '_'] = le.fit_transform(df[c])

# NOTE(review): missing values are replaced by -1 for the rest of the notebook.
# This pulls the -1 placeholders into the Pearson correlations below (corr()
# would otherwise skip NaN pairwise) -- confirm this is intended.
df = df.fillna(-1)

cols_for_correlation = [
    u'DayOfWeek',
    u'DepTime',
    u'ArrTime',
    u'ArrDelay',
    u'Distance',
    u'UniqueCarrier_',
    u'Origin_',
    u'Dest_'
]

corrmat = df[cols_for_correlation].corr()
# Create the 10x10 inch figure up front and draw onto its axes explicitly
# instead of clearing and resizing whatever the "current" figure happens to be.
figure, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corrmat, annot=True, ax=ax)
plt.show()
In [15]:
def plot(col1, col2, data=None):
    """Show a 10x10 inch hexbin joint plot of two columns.

    Parameters
    ----------
    col1, col2 : str
        Names of the columns drawn on the x and y axes.
    data : pandas.DataFrame, optional
        Frame to read the columns from. Defaults to the notebook-level ``df``.
    """
    frame = df if data is None else data
    # https://stanford.edu/~mwaskom/software/seaborn/generated/seaborn.jointplot.html#seaborn.jointplot
    # x / y are passed by keyword: seaborn >= 0.12 rejects positional vectors.
    # No plt.clf() beforehand -- jointplot builds its own figure, so clearing
    # first only leaves a stray empty figure behind.
    sns.jointplot(x=frame[col1], y=frame[col2], dropna=True, kind="hex")
    figure = plt.gcf()
    figure.set_size_inches(10, 10)
    plt.show()
In [16]:
# Joint distribution of the ArrTime and DepTime columns.
plot('ArrTime', 'DepTime')
In [17]:
# Distance against the integer-encoded carrier column. The carrier codes come
# from LabelEncoder, so their order along the axis is alphabetical, not quantitative.
plot('Distance', 'UniqueCarrier_')
In [18]:
# Encoded origin airport against encoded carrier. Both axes are LabelEncoder
# integer codes, so only co-occurrence density is meaningful, not axis order.
plot('Origin_', 'UniqueCarrier_')