In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import svm
from sklearn.externals import joblib
import src.misc.paths as path
import src.vector_gen.generateWeatherVectors as gwv
%matplotlib inline
training_files = "../../dataset/training/"
trajectories_file = "trajectories(table 5)_training.csv"
trajectories_df = pd.read_csv(training_files+trajectories_file)
In [2]:
x, y = gwv.generate_timeInformationVectors(trajectories_df)
x_df = gwv.generate_timeInformationVectorX_df(trajectories_df, True)# using arrays -.-
#x_df = pd.DataFrame(np.reshape(x,(len(x)/3,3)), columns=['dayofweek', 'hour', 'minute'])
y_df = pd.DataFrame(y, columns=['avg_travel_time'])
In [3]:
x
Out[3]:
In [4]:
x_df['date'] = x_df['datetime'].dt.date
x_df = x_df.set_index(['date','hour','minute'])
x_df.head()
Out[4]:
In [5]:
import itertools
t0 = list(x_df['datetime'])
route_touples = [('A', 2), ('A', 3), ('B', 1), ('B', 3), ('C', 1), ('C', 3)]
index_touples = list(itertools.product(t0, route_touples))
index_touples
# fix: build flat [datetime, intersection_id, tollgate_id, route_index] rows by hand
res = []
for z in t0:
    for i_route, route in enumerate(route_touples):
        tmp = []
        tmp.append(z)
        tmp.append(route[0])
        tmp.append(route[1])
        tmp.append(i_route)
        res.append(tmp)
res[:10]
Out[5]:
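The nested loop above flattens what itertools.product returns as nested tuples; the same rows could also be built with a single comprehension. A sketch (the name res_alt is just for illustration):
In [ ]:
# sketch: same rows as the nested loop above, built directly from itertools.product
res_alt = [[z, intersection, tollgate, i_route]
           for z, (i_route, (intersection, tollgate))
           in itertools.product(t0, enumerate(route_touples))]
res_alt[:10]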
In [6]:
#df2 =pd.DataFrame(x.reshape((len(x)/3,3)), columns=feature_cols)
df = pd.DataFrame(res, columns=['datetime', 'intersection_id', 'tollgate_id', 'route'])
df['datetime'] = pd.to_datetime(df['datetime'])
df['date'] = df['datetime'].dt.date
df['hour'] = df['datetime'].dt.hour
df['minute'] = df['datetime'].dt.minute
df['dayofweek'] = df['datetime'].dt.dayofweek
df = df.join(y_df)
df.head()
Out[6]:
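The join in In [6] aligns df and y_df purely by their default integer index, so the two frames have to line up row for row. A quick sanity check, as a sketch (not part of the original run):
In [ ]:
# sanity check (sketch): the join is positional, so the row counts must match
assert len(df) == len(y_df), (len(df), len(y_df))
df['avg_travel_time'].isna().sum()  # NaNs here would hint at misaligned or missing targets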
In [7]:
y
Out[7]:
In [8]:
y_df.head()
Out[8]:
In [9]:
df['hour_index'] = df['hour']
df['minute_index'] = df['minute']
df = df.set_index(['date', 'hour_index', 'minute_index'])
In [10]:
df[['intersection_id', 'tollgate_id', 'route', 'hour', 'minute', 'dayofweek', 'avg_travel_time']].head(10)
Out[10]:
In [11]:
feature_cols = ['route', 'hour', 'minute', 'dayofweek']
predict_cols = ['avg_travel_time']
#feature_cols = ['hour', 'minute', 'dayofweek']
tmp_all_cols = feature_cols.copy()
tmp_all_cols.extend(predict_cols)
df.reset_index()[tmp_all_cols].head(10)
Out[11]:
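Note that route is a categorical index (0-5) fed to the SVR as a plain number. A possible alternative, not used in this notebook, is to one-hot encode it:
In [ ]:
# sketch: one-hot encode the categorical 'route' feature instead of using its integer index
x_onehot = pd.get_dummies(df.reset_index()[feature_cols], columns=['route'])
x_onehot.head(10)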
In [12]:
#from sklearn.model_selection import train_test_split
# not working!?!?!
import src.misc.split_train_valid as split
#training, validation, testing = split.split_dataset(x_df, 0.8, 0)
#x_train, x_test, y_train, y_test = train_test_split(df[feature_cols], df['avg_travel_time'], test_size=0.2, random_state=42)
# manual time-based split instead of k-fold cross-validation:
# 91 days of training data -> 13 weeks; use the first 8 weeks for training
# samples per week = 7 days * 24 hours * 3 windows/hour * 6 routes
num_weeks_train = (7*24*3*6) * 8  # number of training samples (8 weeks)
x_train = df[feature_cols][:num_weeks_train]
x_test = df[feature_cols][num_weeks_train:]
y_train = df['avg_travel_time'][:num_weeks_train]
y_test = df['avg_travel_time'][num_weeks_train:]
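The split above is a single chronological cut. For the cross-validation pondered in the comments, sklearn's TimeSeriesSplit yields time-ordered folds without shuffling; a sketch, assuming df stays in the chronological order built here:
In [ ]:
# sketch: time-ordered CV folds as an alternative to the single manual cut above
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=5)
for fold, (train_idx, valid_idx) in enumerate(tscv.split(df[feature_cols])):
    print(fold, len(train_idx), len(valid_idx))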
In [13]:
%%time
# load
#svr_rbf = joblib.load('svr_rbf.pkl')
# train
svr_rbf = svm.SVR(kernel='rbf', cache_size=2000, epsilon=0.5, C=100.0)
svr_rbf.fit(x_train, y_train)
print(svr_rbf)
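SVR with an RBF kernel is sensitive to feature scale, and route/hour/minute/dayofweek live on different ranges. A possible variant (not the model trained above) wraps the SVR in a scaling pipeline:
In [ ]:
# sketch: standardize the features before the RBF SVR (an alternative setup, not trained above)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

svr_rbf_scaled = make_pipeline(StandardScaler(),
                               svm.SVR(kernel='rbf', cache_size=2000, epsilon=0.5, C=100.0))
svr_rbf_scaled.fit(x_train, y_train)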
In [ ]:
# no reasonable result
#%%time
#svr_sigmoid = svm.SVR(kernel='sigmoid', cache_size=6000)
#svr_sigmoid.fit(x_train, y_train)
#print(svr_sigmoid)
In [16]:
%%time
# load
svr_lin = joblib.load('svr_lin.pkl')
# train
#svr_lin = svm.SVR(kernel='linear', cache_size=6000)
#svr_lin.fit(x_train, y_train)
print(svr_lin)
In [ ]:
# takes forever to train!
#%%time
#svr_poly = svm.SVR(kernel='poly', cache_size=6000)
#svr_poly.fit(x_train, y_train)
#print(svr_poly)
In [14]:
from sklearn.externals import joblib
joblib.dump(svr_rbf, 'svr_rbf.pkl')
#joblib.dump(svr_sigmoid, 'svr_sigmoid.pkl')
joblib.dump(svr_lin, 'svr_lin.pkl')
#joblib.dump(svr_poly, 'svr_poly.pkl')
Out[14]:
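Side note: sklearn.externals.joblib, used above, has since been removed from scikit-learn; on newer versions the same dump/load calls work with the standalone joblib package:
In [ ]:
# on newer scikit-learn versions, use the standalone joblib package instead of sklearn.externals
import joblib
joblib.dump(svr_rbf, 'svr_rbf.pkl')
svr_rbf = joblib.load('svr_rbf.pkl')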
In [17]:
y_pred_rbf = svr_rbf.predict(x_test)
#y_pred_sigmoid = svr_sigmoid.predict(x_test)
y_pred_lin = svr_lin.predict(x_test)
#y_pred_poly = svr_poly.predict(x_test)
In [18]:
res = pd.DataFrame(data={'y_test': np.array(y_test),
                         'y_pred_rbf': y_pred_rbf,
                         # 'y_pred_sigmoid': y_pred_sigmoid,
                         'y_pred_lin': y_pred_lin,
                         # 'y_pred_poly': y_pred_poly
                         })
#res
In [19]:
res = res.copy()
# train by route?
#select_route = 3
#res = res.copy().loc[range(select_route,len(res),6)].reset_index(drop=True)
# hours
#res = res.copy().loc[:2*3*6].reset_index(drop=True)
res.head(7)
Out[19]:
In [21]:
from sklearn import metrics
import src.misc.evaluation as evaluation_mape
print('sklearn mean_squared_error:')
print('y_pred_rbf:', metrics.mean_squared_error(res['y_pred_rbf'], res['y_test']))
#print('y_pred_sigmoid', metrics.mean_squared_error(res['y_pred_sigmoid'], res['y_test']))
print('y_pred_lin:', metrics.mean_squared_error(res['y_pred_lin'], res['y_test']))
print('')
print('sklearn mean_absolute_error:')
print('y_pred_rbf:', metrics.mean_absolute_error(res['y_pred_rbf'], res['y_test']))
print('y_pred_lin:', metrics.mean_absolute_error(res['y_pred_lin'], res['y_test']))
print('')
print('mape(mean_absolute_error):')
print('lower is better, 2havg submission had 0.2116 and rank 141')
print('y_pred_rbf:', evaluation_mape.mape(res['y_pred_rbf'], res['y_test']))
print('y_pred_lin:', evaluation_mape.mape(res['y_pred_lin'], res['y_test']))
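The project's src.misc.evaluation.mape is not shown in this notebook; a typical implementation of the metric (illustrative only, argument order matching the calls above) looks like:
In [ ]:
# illustrative MAPE (the project's src.misc.evaluation.mape is not shown here);
# argument order (y_pred, y_true) matches the calls above
def mape(y_pred, y_true):
    y_pred, y_true = np.asarray(y_pred, dtype=float), np.asarray(y_true, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true))

mape(res['y_pred_rbf'], res['y_test'])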
In [22]:
import matplotlib.pyplot as plt
alpha=0.8
lw=0.7
fig, ax = plt.subplots(figsize = (26,8))
ax.scatter(res.index.values, res['y_test'], color='green', label='y_test', s=0.8)
ax.plot(res.index.values, res['y_pred_rbf'], color='red', label='y_pred_rbf', alpha=alpha, lw=lw)
#plt.plot(res.index.values, res['y_pred_sigmoid'], color='blue', label='y_pred_sigmoid', alpha=alpha, lw=lw)
ax.plot(res.index.values, res['y_pred_lin'], color='orange', label='y_pred_lin', alpha=alpha, lw=lw)
#plt.plot(res.index.values, res['y_pred'], color='darkorange', label='y_pred_rbf')
ax.set_title('SVR on TimeInformation with rbf and linear kernel')
ax.set_xlabel('index(route,dayofweek,hour,minute)')
ax.set_ylabel('avg_travel_time')
ax.set_ylim(0,200)
ax.set_xlim(0)
ax.legend(shadow=True, fancybox=True)
#fig.legend()
'''
I also want to share my recent results. Maybe it helps or inspires someone.
SVR on TimeInformation with rbf and linear kernels.
Trained on the first 8 weeks and tested on the remaining data (not shuffled).
'''
Out[22]:
In [23]:
fig, ax = plt.subplots(figsize = (26,8))
ax.plot(res.index.values, res['y_pred_rbf'], color='red', label='y_pred_rbf', alpha=alpha, lw=lw)
Out[23]:
In [ ]:
from sklearn.model_selection import GridSearchCV
svr = GridSearchCV(svm.SVR(kernel='rbf', cache_size=3000),
                   param_grid={"epsilon": [0.1],
                               "C": [150, 175, 200]},
                   n_jobs=4)
svr.fit(x_train, y_train)
#, epsilon=0.5, C=100.0)
#svr_rbf.fit(x_train, y_train))
In [103]:
print(svr.best_estimator_)
print(svr.best_params_)
pd.DataFrame(svr.cv_results_).sort_values('rank_test_score')
Out[103]:
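Since the competition metric is MAPE rather than the R^2 that GridSearchCV uses by default for regressors, the search could score on MAPE directly via make_scorer. A sketch using the illustrative mape() defined earlier (the parameter grid values here are just placeholders):
In [ ]:
# sketch: grid search scored on MAPE instead of the default R^2
from sklearn.metrics import make_scorer

# make_scorer passes (y_true, y_pred); the illustrative mape() above takes (y_pred, y_true)
mape_scorer = make_scorer(lambda y_true, y_pred: mape(y_pred, y_true),
                          greater_is_better=False)
svr_mape = GridSearchCV(svm.SVR(kernel='rbf', cache_size=3000),
                        param_grid={"epsilon": [0.1, 0.5], "C": [100, 150, 200]},
                        scoring=mape_scorer, n_jobs=4)
svr_mape.fit(x_train, y_train)
print(svr_mape.best_params_, -svr_mape.best_score_)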