In [1]:
%matplotlib inline
%pylab inline
In [2]:
import warnings
warnings.filterwarnings('ignore')
In [3]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
import pandas as pd
import dask.dataframe as dd
In [36]:
# pandas 0.20 or newer is required for the plotting used in this notebook
# !conda update pandas -y
pandas_version = pd.__version__
print(pandas_version)
In [4]:
# Target folder and resolution referenced by the (currently commented-out)
# plt.savefig calls
DPI = 120
IMG_DIR = './analysis'
In [5]:
# http://dask.pydata.org/en/latest/dataframe-overview.html
# Open the 2001 CSV as a dask dataframe; %time reports how long the call takes.
%time lazy_df = dd.read_csv('./data/2001.csv', encoding='iso-8859-1')
In [6]:
# Count the rows of the dask dataframe (timed).
%time len(lazy_df)
Out[6]:
In [7]:
# http://dask.pydata.org/en/latest/dataframe-api.html#dask.dataframe.DataFrame.sample
# dask samples by fraction, so the desired row count is converted into one.
s = 10000 # desired sample size
n = 5967780 # total row count; presumably the len(lazy_df) result -- confirm if the input file changes
fraction = s / n
# The fraction is passed by keyword because the first positional parameter of
# sample() is `n` in current dask, which is rejected. The seed is pinned so a
# re-run draws the same rows.
df = lazy_df.sample(frac=fraction, random_state=42)
In [8]:
# Row count of the sample (timed); sampling is by fraction, so it may not be exactly 10000.
%time len(df)
Out[8]:
In [9]:
# Peek at the first rows of the sampled dataframe.
df.head()
Out[9]:
In [10]:
# first turn our 10000 samples into a normal pandas df for convenience
# NOTE(review): this rebinds `df` to the computed result, so re-running this cell
# on its own would call .compute() on the already-computed object.
%time df = df.compute()
In [11]:
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# Turn the text labels into integer codes, stored in a new column named after
# the original with a '_' suffix.
text_cols = ['UniqueCarrier', 'Origin', 'Dest']
le = preprocessing.LabelEncoder()
for c in text_cols:
    # fit_transform gives each distinct label its position in sorted order,
    # i.e. the same codes the separate fit/transform/replace steps produced.
    # Assigning the result directly avoids an in-place replace() on a column
    # selected from df, which does not reliably write back to df.
    df[c + '_'] = le.fit_transform(df[c])
In [12]:
# Preview the frame, now including the '_'-suffixed encoded columns.
df.head()
Out[12]:
In [13]:
# Substitute -1 for every missing value in the frame.
# NOTE(review): -1 may also be a genuine value in some numeric columns -- confirm
# it is a safe placeholder before relying on statistics computed from them.
df = df.fillna(-1)
In [14]:
# Preview the frame after missing values were replaced with -1.
df.head()
Out[14]:
In [15]:
# Columns that enter the correlation matrix: the raw numeric fields first,
# followed by the label-encoded ('_'-suffixed) text fields.
numeric_fields = ['DayOfWeek', 'DepTime', 'ArrTime', 'ArrDelay', 'Distance']
encoded_fields = ['UniqueCarrier_', 'Origin_', 'Dest_']
cols_for_correlation = numeric_fields + encoded_fields
In [16]:
# Pairwise correlations of the selected columns, drawn as an annotated heatmap
# on a 10x10 inch figure.
corrmat = df[cols_for_correlation].corr()
sns.heatmap(corrmat, annot=True)
figure = plt.gcf()
figure.set_size_inches(10, 10)
plt.show()
# plt.savefig(IMG_DIR+'/corr.png', dpi = DPI)
In [17]:
def plot(col1, col2, data=None):
    """Draw a hexbin joint plot of two columns on a 10x10 inch figure.

    col1 -- name of the column shown on the x axis
    col2 -- name of the column shown on the y axis
    data -- dataframe holding both columns; defaults to the notebook-level `df`
    """
    frame = df if data is None else data
    # https://stanford.edu/~mwaskom/software/seaborn/generated/seaborn.jointplot.html#seaborn.jointplot
    # x and y are passed by keyword: recent seaborn releases no longer accept
    # them positionally.
    sns.jointplot(x=frame[col1], y=frame[col2], dropna=True, kind="hex")
    figure = plt.gcf()
    figure.set_size_inches(10, 10)
    # for notebook
    plt.show()
    # plt.savefig('%s/%s_%s.png'%(IMG_DIR, col1, col2), dpi = DPI)
In [18]:
# Joint distribution of ArrTime (x) against DepTime (y).
plot('ArrTime', 'DepTime')
In [19]:
# Joint distribution of Distance (x) against the encoded carrier column (y).
plot('Distance', 'UniqueCarrier_')
In [20]:
# Joint distribution of the encoded origin column (x) against the encoded carrier column (y).
plot('Origin_', 'UniqueCarrier_')
In [34]:
# no idea how to interpret this
plot('ArrDelay', 'DepTime')
In [42]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Numeric columns compared pairwise in the scatter matrix
cols_for_scatter = [
    'DayOfWeek',
    'DepTime',
    'ArrTime',
    'ArrDelay',
    'DepDelay',
    'Distance',
]
# scatter_matrix creates its own figure sized by `figsize`, so the earlier
# plt.clf() / plt.figure(...) calls only produced an extra empty figure and
# have been removed.
pd.plotting.scatter_matrix(df[cols_for_scatter], figsize=(20, 20), diagonal='kde')
plt.show()
In [43]:
# Distance column of the sample; labelled in miles on the plots that use it.
distance = df['Distance']
In [50]:
# AirTime column of the sample; labelled in minutes on the plots that use it.
# NOTE(review): missing values were replaced with -1 by the fillna cell, so this
# series may contain -1 placeholders -- confirm before modelling.
airtime = df['AirTime']
In [56]:
# Raw relationship between air time (x axis) and distance (y axis)
x_label, y_label = 'Airtime, minutes', 'Distance, miles'
plt.scatter(airtime, distance, color='black')
plt.xlabel(x_label)
plt.ylabel(y_label)
Out[56]:
In [107]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
In [108]:
# Number of rows available for the regression.
len(airtime)
Out[108]:
In [109]:
# Hold out the last 2000 rows for testing; everything before them is training data.
test_rows = 2000
airtime_train, airtime_test = airtime[:-test_rows], airtime[-test_rows:]
distance_train, distance_test = distance[:-test_rows], distance[-test_rows:]
In [110]:
# Create linear regression object
regr = linear_model.LinearRegression()
# Train the model using the training sets.
# scikit-learn expects 2-D arrays; the underlying NumPy values are reshaped
# because pandas Series no longer provides reshape().
regr.fit(airtime_train.values.reshape(-1, 1), distance_train.values.reshape(-1, 1))
Out[110]:
In [111]:
# Predict distance for the held-out air times. The NumPy values are reshaped to
# a single-feature 2-D array because pandas Series no longer provides reshape().
distance_pred = regr.predict(airtime_test.values.reshape(-1, 1))
In [112]:
# Mean squared error of the predictions on the held-out rows.
mean_squared_error(distance_test, distance_pred)
Out[112]:
In [114]:
# R^2 (coefficient of determination): 1 is perfect prediction, pretty good score
r2_score(distance_test, distance_pred)
Out[114]:
In [115]:
# Held-out points in black, with the fitted regression line drawn over them in red
line_style = dict(color='red', linewidth=3)
plt.scatter(airtime_test, distance_test, color='black')
plt.xlabel('Airtime, minutes')
plt.ylabel('Distance, miles')
plt.plot(airtime_test, distance_pred, **line_style)
Out[115]:
In [119]:
# approx. 442 miles / hour
# predict() requires a 2-D array of shape (n_samples, n_features), so the single
# 60-minute query is wrapped as one row with one feature.
regr.predict([[60]])
Out[119]:
In [117]:
# km /h (surprisingly low), maybe a lot of start and landing phases
# The 60-minute prediction is converted directly instead of using a hand-copied
# 442, which goes stale whenever the sample or the fit changes.
MILES_TO_KM = 1.60934
float(np.ravel(regr.predict([[60]]))[0]) * MILES_TO_KM
Out[117]:
In [21]:
# 2400 is not a valid time
# Vectorised replacement; avoids a Python-level lambda call for every row.
df['CRSDepTime'] = df['CRSDepTime'].replace(2400, 2359)
In [22]:
# Build the '@timestamp' column from the date columns and the 4-digit HHMM value
# in CRSDepTime. to_datetime assembles the components column-wise, which avoids
# formatting and re-parsing one string per row.
crs_dep = df['CRSDepTime'].astype(int).values
timestamp_parts = pd.DataFrame({
    'year': df['Year'].values,
    'month': df['Month'].values,
    'day': df['DayofMonth'].values,
    'hour': crs_dep // 100,
    'minute': crs_dep % 100,
})
# Raw arrays are used on both sides so the assignment is positional and does
# not depend on df's index labels being unique.
df['@timestamp'] = pd.to_datetime(timestamp_parts).values
In [23]:
# Preview the frame, now including the '@timestamp' column.
df.head()
Out[23]:
In [24]:
# Timestamp column built from the date columns and CRSDepTime; used by the histograms.
timestamps = df['@timestamp']
In [25]:
# plt.hist?
In [26]:
# Distribution of the sampled timestamps over 365 bins, drawn as a black outline
plt.hist(
    timestamps.tolist(),
    histtype='step',
    color='black',
    bins=365,
)
plt.show()
In [27]:
# Rough number of sampled rows per bin, assuming the 10000-row sample spreads evenly over 365 bins.
10000 / 365
Out[27]:
In [28]:
# Distribution of the sampled timestamps over 12 bins, drawn as filled bars
plt.hist(
    timestamps.tolist(),
    histtype='bar',
    bins=12,
)
plt.show()
In [29]:
# Convert the flag columns to booleans: 0 becomes False, anything else True.
# A vectorised comparison replaces the row-by-row lambda and its redundant
# `False if ... else True` ternary.
for flag in ('Cancelled', 'Diverted'):
    df[flag] = df[flag] != 0
In [30]:
# Preview the frame with Cancelled and Diverted converted to booleans.
df.head()
Out[30]:
In [ ]: