In [1]:
%matplotlib inline
import math
import pytz
import traceback
import time
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
import cPickle as pickle
In [2]:
%run src/data/helper.py
In [3]:
start_time = time.time()
# Load the cleaned station metadata and readings produced by the parsing
# pipeline (Python-2 cPickle files written elsewhere in the project).
with open('data/parsed/stations_clean.p', 'rb') as f:
    stations = pickle.load(f)
with open('data/parsed/readings_clean.p', "rb") as f:
    readings = pickle.load(f)
end_time = time.time()
# Wall-clock load time, mostly dominated by unpickling `readings`.
print 'Opening redistribution data took %s' % (end_time - start_time)
In [4]:
columns = ['CollNbBikes', 'DewPt', 'DistNbBikes', 'Humidity', 'NbBikes', 'Rain', 'Temp', 'Visibility',
           'WindSpeed', 'TimeOfDay']
# Work on a 0.5% random sample: the full readings frame is far too large
# to pairplot. NOTE(review): no random seed is set, so the sample (and the
# figure) changes on every re-run.
data = readings.sample(frac=0.005)[columns]
In [5]:
sns.set(style="ticks", color_codes=True)
# takes too long!
g = sns.pairplot(data)
g.savefig('destination_path.eps', format='eps', dpi=1000)
g
Out[5]:
In [3]:
corr = data.corr()
In [4]:
# Generate a mask for the upper triangle.
# FIX: `np.bool` was a deprecated alias for the builtin `bool` and was
# removed in NumPy 1.24; the builtin is the documented, equivalent dtype.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
g = sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
                square=True,
                linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
g.set_xticklabels(g.get_xticklabels(), rotation=90)
g.set_yticklabels(g.get_yticklabels(), rotation=0)
In [4]:
# Time-based dataset splits; each lambda slices a frame whose index is a
# DatetimeIndex by label range (pandas slicing is inclusive of both ends).
split_training = lambda df: df[datetime(2016,5,15,0,0,0,0):datetime(2016,6,12,23,59,59,999999)]
split_validation = lambda df: df[datetime(2016,6,13,0,0,0,0):datetime(2016,6,19,23,59,59,999999)]
# BUG FIX: the test window previously started on 2016-05-20, overlapping the
# entire training period (May 15 - Jun 12). The week immediately after
# validation (Jun 20-26) was clearly intended.
split_test = lambda df: df[datetime(2016,6,20,0,0,0,0):datetime(2016,6,26,23,59,59,999999)]
In [5]:
def split_datasets(df, station_id):
    """Slice one station's readings into (training, validation, test).

    `df` is indexed by (station_id, timestamp); the station is selected
    first, then the date-range lambdas partition its rows.
    """
    station_df = df.loc[station_id]
    return (split_training(station_df),
            split_validation(station_df),
            split_test(station_df))
In [6]:
import sys
def clip_and_round(arr):
    """Clip negative predictions to zero and round to whole bike counts.

    FIX: the upper bound was `sys.maxint`, which only exists in Python 2;
    `np.clip` accepts None for "no upper bound", which is equivalent here
    and portable.
    """
    arr = np.clip(arr, 0, None)
    return np.round(arr)
In [7]:
last_data = None
In [8]:
import inspect
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_squared_error
from rpy2 import robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
from rpy2.robjects import IntVector, Formula
pandas2ri.activate()
r = robjects.r
base = importr('base')
stats = importr('stats')
mgcv = importr('mgcv')
class GAMRegressor(BaseEstimator, RegressorMixin):
    """sklearn-compatible wrapper around R's mgcv::gam via rpy2.

    Hyper-parameters are named '<Feature>_<parameter>' (e.g.
    NbBikesTMinus2_sp) so build_features_dicts() can map each one back onto
    the matching feature's smooth-term settings when composing the formula.
    Scoring returns negative RMSE so "greater is better" for model search.
    """
    # Class-level reference to the most recent training frame (set in fit);
    # appears to be kept for debugging from other cells.
    last_data = None
    def __init__(self, features=None, formula_str=None,
                 FogTMinus2_parametric=None, RainTMinus2_parametric=None,
                 DistNbBikes_parametric=None, CollNbBikes_parametric=None,
                 TempTMinus2AndHumidityTMinus2_sp=None,
                 NbBikesTMinus2_sp=None, NbBikesTMinus3_sp=None,
                 NbBikesTMinus12_sp=None, NbBikesTMinus18_sp=None,
                 TimeOfDay_1_sp=None, TimeOfDay_2_sp=None, TimeOfDay_3_sp=None,
                 TimeOfDay_1_by=None, TimeOfDay_2_by=None, TimeOfDay_3_by=None):
        # Reflectively capture every constructor argument so each can be
        # stored with setattr -- keeps the sklearn get_params/set_params
        # contract without spelling out ~16 assignments.
        args, _, _, values = inspect.getargvalues(inspect.currentframe())
        values.pop("self")
        self.args_to_set = []
        for arg, val in values.items():
            # save a list of the arguments
            if arg != 'features' and arg != 'formula_str':
                self.args_to_set.append(arg)
            setattr(self, arg, val)
    def fit(self, X, y=None):
        # Build the formula lazily from per-feature settings unless an
        # explicit formula string was supplied at construction.
        if self.formula_str is None:
            features_dicts = self.build_features_dicts()
            self.formula_str = self.build_formula_str(features_dicts)
        GAMRegressor.last_data=X
        frm = Formula(self.formula_str)
        self.gam = mgcv.gam(frm, data=X)
        return self
    def predict(self, X):
        # NOTE(review): if fit() was never called, self.gam does not exist,
        # so an AttributeError is raised before this assert can fire.
        assert (self.gam is not None), "GAM must be set"
        p_val = clip_and_round(stats.predict(self.gam, newdata=X))
        return p_val
    def score(self, X):
        # Negated RMSE against the NbBikes column of X (y is taken from X,
        # not passed separately -- non-standard for sklearn scorers).
        p_val = self.predict(X)
        y_val = X.NbBikes
        rmse = mean_squared_error(y_val, p_val)**0.5
        return rmse * (-1)
    def build_features_dicts(self):
        """Initialise one settings dict per feature, then overlay any
        constructor-supplied '<Feature>_<parameter>' overrides."""
        assert (self.features is not None), "features must be set"
        # initialize the dictionaries
        features_dicts = {}
        for feature in self.features:
            features_dicts[feature] = {
                'name': feature,
                'bs': 'tp',        # thin-plate spline basis by default
                'sp': None,        # smoothing parameter (let mgcv choose)
                'by': None,        # interaction ("by") variable
                'k': None,         # basis dimension
                'parametric': False
            }
        # set parameter values
        for arg in self.args_to_set:
            val = getattr(self, arg)
            if val is None:
                continue
            # 'TimeOfDay_1_sp' -> feature 'TimeOfDay_1', parameter 'sp'
            feature, parameter = arg.rsplit('_',1)
            features_dicts[feature][parameter] = val
        return features_dicts
    def build_formula_str(self, features_dicts):
        """Compose an R formula string: parametric terms verbatim, all
        others wrapped in s(...) smooth terms with their settings."""
        formula = 'NbBikes ~ '
        # NOTE(review): dict iteration order determines term order; on
        # Python 2 this is arbitrary (harmless for the fitted model).
        for feature, feature_dict in features_dicts.iteritems():
            if feature_dict['parametric']:
                formula += '%(name)s+' % feature_dict
                continue
            # 'TimeOfDay_1' -> base name 'TimeOfDay' (index discarded);
            # 'AAndB' becomes a bivariate smooth s(A,B).
            tokens = feature_dict['name'].split('_')
            name, index = (tokens[0],None) if len(tokens) == 1 else (tokens[0], tokens[1])
            formula += "s(%s" % name.replace('And', ',')
            if feature_dict['bs'] is not None:
                formula += ", bs='%s'" % feature_dict['bs']
            if feature_dict['sp'] is not None:
                formula += ", sp=%f" % feature_dict['sp']
            if feature_dict['by'] is not None:
                formula += ", by=%s" % feature_dict['by']
            if feature_dict['k'] is not None:
                formula += ", k=%s" % feature_dict['k']
            # NOTE(review): the "% feature_dict" here is a no-op -- ")+"
            # contains no format specifiers -- and could be dropped.
            formula += ")+" % feature_dict
        # strip the trailing '+'
        return formula[:-1]
class LRegressor(BaseEstimator, RegressorMixin):
    """Linear model fitted through R's stats::lm via rpy2.

    Mirrors GAMRegressor's interface: fit/predict take DataFrames, and
    score() returns negative RMSE against the NbBikes column of X.
    """
    def __init__(self, formula_str):
        self.formula_str = formula_str

    def fit(self, X, y=None):
        self.lr = stats.lm(Formula(self.formula_str), data=X)
        return self

    def predict(self, X):
        assert (self.lr is not None), "LR must be set"
        # Predictions are clipped at zero and rounded to whole bikes.
        return clip_and_round(stats.predict(self.lr, newdata=X))

    def score(self, X):
        predictions = self.predict(X)
        actual = X.NbBikes
        # Negative RMSE: greater is better for sklearn model selection.
        return -(mean_squared_error(actual, predictions) ** 0.5)
In [9]:
from sklearn.linear_model import LinearRegression
def fit_and_predict_lm(training, validation, formula):
    """Fit an LRegressor on `training`; return (model, validation preds).

    FIX: the outer clip_and_round call was redundant -- LRegressor.predict
    already clips and rounds, and the operation is idempotent, so dropping
    it leaves the result unchanged.
    """
    lm = LRegressor(formula_str=formula)
    lm.fit(training)
    return lm, lm.predict(validation)
In [10]:
def fit_and_predict_gam(training, validation, formula):
    """Fit a GAMRegressor on `training`; return (model, validation preds).

    FIX: the outer clip_and_round call was redundant -- GAMRegressor.predict
    already clips and rounds, and the operation is idempotent, so dropping
    it leaves the result unchanged.
    """
    gam = GAMRegressor(formula_str=formula)
    gam.fit(training)
    return gam, gam.predict(validation)
In [11]:
def model(df, station_ids, gam_formula, lr_formula, pred_col):
results = []
for station_id in station_ids:
print 'Fitting %s' % station_id
training, validation, test = split_datasets(df, station_id)
y_val = validation[pred_col]
try:
# Linear Model
lm_fit = fit_and_predict_lm(training, validation, lr_formula)
lm_rmse = mean_squared_error(y_val, lm_fit[1])**0.5
# GAM Model
gam_fit = fit_and_predict_gam(training, validation, gam_formula)
gam_rmse = mean_squared_error(y_val, gam_fit[1])**0.5
except Exception as e:
logging.error(traceback.format_exc())
results.append({'Id': station_id, 'LR-ERR': lm_rmse, 'GAM-ERR': gam_rmse})
return results
In [12]:
def convert_results_to_df(results, name):
    """Combine per-experiment result records into one wide DataFrame.

    `results` is a list of experiments; each experiment is a list of dicts
    carrying an 'Id' key plus error columns. Columns are prefixed
    '<name>-EXP<i>-' so the experiments stay distinguishable after the
    column-wise concat on the shared Id index.
    """
    frames = []
    for i, experiment in enumerate(results):
        frame = pd.DataFrame(experiment).set_index('Id')
        frame.columns = ['%s-EXP%d-%s' % (name, i, col) for col in frame.columns]
        frames.append(frame)
    return pd.concat(frames, axis=1)
In [13]:
# Toggle between the pre-sampled station subset and every station present
# in the readings index.
use_samples = True
if use_samples:
    with open("data/parsed/stations_sample.p", "rb") as f:
        stations_to_use = pickle.load(f)
else:
    stations_to_use = readings.index.get_level_values(0).unique().tolist()
In [ ]:
# NOTE(review): stale cell -- nothing is saved here; this print is left
# over from an earlier version of the notebook.
print 'Saved stations'
In [85]:
gam_formula_short = "NbBikes ~ s(TempTMinus2, HumidityTMinus2, bs='tp') + s(TimeOfDay, by=Weekday, bs='tp') "
gam_formula_short += "+ s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') + s(NbBikesTMinus2, bs='tp') "
gam_formula_short += "+ s(NbBikesTMinus3, bs='tp') + RainTMinus2 + FogTMinus2 "
In [86]:
lr_formula_short = "NbBikes ~ TempTMinus2 + HumidityTMinus2 + TimeOfDay + Weekday "
lr_formula_short += "+ NbBikesTMinus2 + NbBikesTMinus3 + RainTMinus2 + FogTMinus2 "
In [87]:
# choose the columns to use in the model
boolean_cols_short = ['Weekday', 'Weekend', 'Holiday', 'RainTMinus2', 'FogTMinus2']
numeric_cols_short = ['HumidityTMinus2', 'TempTMinus2', 'TimeOfDay',
'NbBikesTMinus2', 'NbBikesTMinus3']
pred_col_short = 'NbBikes'
feature_cols_short = numeric_cols_short + boolean_cols_short
cols_short = [pred_col_short] + feature_cols_short
# select the columns chosen columns
readings_short = readings.loc[stations_to_use][cols_short]
# remove na
readings_short.dropna(inplace=True)
In [88]:
baseline_short = [model(readings_short, stations_to_use, gam_formula_short, lr_formula_short, 'NbBikes')]
In [89]:
baseline_short_df = convert_results_to_df(baseline_short, 'BASE')
baseline_short_df[['BASE-EXP0-GAM-ERR', 'BASE-EXP0-LR-ERR']].mean()
Out[89]:
In [90]:
with open("data/parsed/baseline_short.p", "wb") as f:
pickle.dump(baseline_short, f)
In [91]:
bicycle_redistribution_experiments = [('CollNbBikes', 'DistNbBikes'), ('CollNbBikesCum2', 'DistNbBikesCum2'), ('CollNbBikesCum6', 'DistNbBikesCum6')]
In [92]:
# choose the columns to use in the model
redistribution_cols = ['CollNbBikes' , 'CollNbBikesCum2', 'CollNbBikesCum6', 'DistNbBikes' , 'DistNbBikesCum2', 'DistNbBikesCum6']
boolean_cols_short = ['Weekday', 'Weekend', 'Holiday', 'RainTMinus2', 'FogTMinus2']
numeric_cols_short = ['HumidityTMinus2', 'TempTMinus2', 'TimeOfDay',
'NbBikesTMinus2', 'NbBikesTMinus3']
pred_col_short = 'NbBikes'
feature_cols_short = numeric_cols_short + boolean_cols_short + redistribution_cols
cols_short = [pred_col_short] + feature_cols_short
# select the columns chosen columns
readings_short = readings.loc[stations_to_use][cols_short]
# remove na
readings_short.dropna(inplace=True)
In [93]:
# Run one experiment per redistribution-feature pair, appending the pair's
# columns to the baseline short-term formulas. Strings are immutable, so
# the base formulas are re-read fresh each iteration and never mutated.
redistribution_experiments_short = []
for redistribution_columns in bicycle_redistribution_experiments:
    # add redistribution features to formula
    gam_formula = gam_formula_short
    lr_formula = lr_formula_short
    gam_formula += "+ %s + %s " % (redistribution_columns[0],redistribution_columns[1])
    lr_formula += "+ %s + %s " % (redistribution_columns[0],redistribution_columns[1])
    print '*******************************************************************'
    print 'Experiment with column %s and %s' % (redistribution_columns[0],redistribution_columns[1])
    print 'GAM Formula: %s' % gam_formula
    print 'LR Formula %s' % lr_formula
    results = model(readings_short, stations_to_use, gam_formula, lr_formula, 'NbBikes')
    redistribution_experiments_short.append(results)
In [94]:
with open("data/parsed/redistribution_short.p", "wb") as f:
pickle.dump(redistribution_experiments_short, f)
In [95]:
surrounding_stations_experiments = [1,2,5,10]
In [96]:
# choose the columns to use in the model
surrounding_stations_cols = ['Near1TMinus2', 'Near1TMinus3',
'Near2TMinus2', 'Near2TMinus3',
'Near3TMinus2', 'Near3TMinus3',
'Near4TMinus2', 'Near4TMinus3',
'Near5TMinus2', 'Near5TMinus3',
'Near6TMinus2', 'Near6TMinus3',
'Near7TMinus2', 'Near7TMinus3',
'Near8TMinus2', 'Near8TMinus3',
'Near9TMinus2', 'Near9TMinus3',
'Near10TMinus2', 'Near10TMinus3']
boolean_cols_short = ['Weekday', 'Weekend', 'Holiday', 'RainTMinus2', 'FogTMinus2']
numeric_cols_short = ['HumidityTMinus2', 'TempTMinus2', 'TimeOfDay',
'NbBikesTMinus2', 'NbBikesTMinus3']
pred_col_short = 'NbBikes'
feature_cols_short = numeric_cols_short + boolean_cols_short + surrounding_stations_cols
cols_short = [pred_col_short] + feature_cols_short
# select the columns chosen columns
readings_short = readings.loc[stations_to_use][cols_short]
# remove na
readings_short.dropna(inplace=True)
In [97]:
surrounding_stations_experiments_short = []
for nb_surrounding_stations in surrounding_stations_experiments:
surrounding_stations_features = []
# Generalized Additive Model
gam_formula = gam_formula_short
lr_formula = lr_formula_short
if nb_surrounding_stations > 0:
surrounding_stations_features.extend(['Near%dTMinus2' % near for near in range(1,21)][0:nb_surrounding_stations])
surrounding_stations_features.extend(['Near%dTMinus3' % near for near in range(1,21)][0:nb_surrounding_stations])
for feature in surrounding_stations_features:
gam_formula += "+ s(%s, bs='tp') " % feature
lr_formula += "+ %s " % feature
print '*******************************************************************'
print 'Experiment with %d surrounding stations' % nb_surrounding_stations
print 'GAM Formula: %s' % gam_formula
print 'LR Formula %s' % lr_formula
results = model(readings_short, stations_to_use, gam_formula, lr_formula, 'NbBikes')
surrounding_stations_experiments_short.append(results)
In [98]:
with open("data/parsed/surrounding_short.p", "wb") as f:
pickle.dump(surrounding_stations_experiments_short, f)
In [99]:
# choose the columns to use in the model
surrounding_stations_cols = ['Near1TMinus2', 'Near1TMinus3', 'Near2TMinus2', 'Near2TMinus3']
redistribution_cols = ['CollNbBikes', 'DistNbBikes']
boolean_cols_short = ['Weekday', 'Weekend', 'Holiday', 'RainTMinus2', 'FogTMinus2']
numeric_cols_short = ['HumidityTMinus2', 'TempTMinus2', 'TimeOfDay',
'NbBikesTMinus2', 'NbBikesTMinus3']
pred_col_short = 'NbBikes'
feature_cols_short = numeric_cols_short + boolean_cols_short + surrounding_stations_cols + redistribution_cols
cols_short = [pred_col_short] + feature_cols_short
# select the columns chosen columns
readings_short = readings.loc[stations_to_use][cols_short]
# remove na
readings_short.dropna(inplace=True)
In [100]:
gam_formula = gam_formula_short + "+ s(%s, bs='tp') + s(%s, bs='tp') + %s + %s " % ('Near1TMinus2','Near1TMinus3', 'CollNbBikes', 'DistNbBikes')
lr_formula = lr_formula_short + "+ %s + %s + %s + %s " % ('Near1TMinus2', 'Near1TMinus3', 'CollNbBikes', 'DistNbBikes')
In [101]:
all_short = [model(readings_short, stations_to_use, gam_formula, lr_formula, 'NbBikes')]
In [102]:
with open("data/parsed/all_short.p", "wb") as f:
pickle.dump(all_short, f)
In [103]:
with open("data/parsed/stations_sample.p", "rb") as f:
stations_to_use = pickle.load(f)
In [118]:
lr_formula_mid = "NbBikes ~ TempTMinus12 + HumidityTMinus12 + TimeOfDay + Weekday "
lr_formula_mid += "+ NbBikesTMinus12 + NbBikesTMinus18 "
In [14]:
gam_formula_mid = "NbBikes ~ s(TempTMinus12, HumidityTMinus12, bs='tp') + s(TimeOfDay, by=Weekday, bs='tp') "
gam_formula_mid += "+ s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') + s(NbBikesTMinus12, bs='tp') "
gam_formula_mid += "+ s(NbBikesTMinus18, bs='tp') "
In [15]:
# choose the columns to use in the model
boolean_cols_mid = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_mid = ['HumidityTMinus12', 'TempTMinus12', 'TimeOfDay', 'NbBikesTMinus12', 'NbBikesTMinus18']
pred_col_mid = 'NbBikes'
feature_cols_mid = numeric_cols_mid + boolean_cols_mid
cols_mid = [pred_col_mid] + feature_cols_mid
# select the columns chosen columns
readings_mid = readings.loc[stations_to_use][cols_mid]
# remove na
readings_mid.dropna(inplace=True)
In [121]:
baseline_mid = [model(readings_mid, stations_to_use, gam_formula_mid, lr_formula_mid, 'NbBikes')]
In [122]:
with open("data/parsed/baseline_mid.p", "wb") as f:
pickle.dump(baseline_mid, f)
In [123]:
bicycle_redistribution_experiments = [('CollNbBikes', 'DistNbBikes'), ('CollNbBikesCum2', 'DistNbBikesCum2'), ('CollNbBikesCum6', 'DistNbBikesCum6')]
In [124]:
# choose the columns to use in the model
redistribution_cols = ['CollNbBikes' , 'CollNbBikesCum2', 'CollNbBikesCum6', 'DistNbBikes' , 'DistNbBikesCum2', 'DistNbBikesCum6']
boolean_cols_mid = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_mid = ['HumidityTMinus12', 'TempTMinus12', 'TimeOfDay', 'NbBikesTMinus12', 'NbBikesTMinus18']
pred_col_mid = 'NbBikes'
feature_cols_mid = numeric_cols_mid + boolean_cols_mid
cols_mid = [pred_col_mid] + feature_cols_mid + redistribution_cols
# select the columns chosen columns
readings_mid = readings.loc[stations_to_use][cols_mid]
# remove na
readings_mid.dropna(inplace=True)
In [125]:
redistribution_experiments_mid = []
for redistribution_columns in bicycle_redistribution_experiments:
# add redistribution features to formula
gam_formula = gam_formula_mid
lr_formula = lr_formula_mid
gam_formula += "+ %s + %s " % (redistribution_columns[0],redistribution_columns[1])
lr_formula += "+ %s + %s " % (redistribution_columns[0],redistribution_columns[1])
print '*******************************************************************'
print 'Experiment with column %s and %s' % (redistribution_columns[0],redistribution_columns[1])
print 'GAM Formula: %s' % gam_formula
print 'LR Formula %s' % lr_formula
results = model(readings_mid, stations_to_use, gam_formula, lr_formula, 'NbBikes')
redistribution_experiments_mid.append(results)
In [126]:
with open("data/parsed/redistribution_mid.p", "wb") as f:
pickle.dump(redistribution_experiments_mid, f)
In [127]:
surrounding_stations_experiments = [1,2,5,10]
In [128]:
# choose the columns to use in the model
surrounding_stations_cols = ['Near1TMinus12', 'Near1TMinus18',
'Near2TMinus12', 'Near2TMinus18',
'Near3TMinus12', 'Near3TMinus18',
'Near4TMinus12', 'Near4TMinus18',
'Near5TMinus12', 'Near5TMinus18',
'Near6TMinus12', 'Near6TMinus18',
'Near7TMinus12', 'Near7TMinus18',
'Near8TMinus12', 'Near8TMinus18',
'Near9TMinus12', 'Near9TMinus18',
'Near10TMinus12', 'Near10TMinus18']
boolean_cols_mid = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_mid = ['HumidityTMinus12', 'TempTMinus12', 'TimeOfDay', 'NbBikesTMinus12', 'NbBikesTMinus18']
pred_col_mid = 'NbBikes'
feature_cols_mid = numeric_cols_mid + boolean_cols_mid
cols_mid = [pred_col_mid] + feature_cols_mid + surrounding_stations_cols
# select the columns chosen columns
readings_mid = readings.loc[stations_to_use][cols_mid]
# remove na
readings_mid.dropna(inplace=True)
In [129]:
surrounding_stations_experiments_mid = []
for nb_surrounding_stations in surrounding_stations_experiments:
surrounding_stations_features = []
# Generalized Additive Model
gam_formula = gam_formula_mid
lr_formula = lr_formula_mid
if nb_surrounding_stations > 0:
surrounding_stations_features.extend(['Near%dTMinus12' % near for near in range(1,21)][0:nb_surrounding_stations])
surrounding_stations_features.extend(['Near%dTMinus18' % near for near in range(1,21)][0:nb_surrounding_stations])
for feature in surrounding_stations_features:
gam_formula += "+ s(%s, bs='tp') " % feature
lr_formula += "+ %s " % feature
print '*******************************************************************'
print 'Experiment with %d surrounding stations' % nb_surrounding_stations
print 'GAM Formula: %s' % gam_formula
print 'LR Formula %s' % lr_formula
results = model(readings_mid, stations_to_use, gam_formula, lr_formula, 'NbBikes')
surrounding_stations_experiments_mid.append(results)
In [130]:
with open("data/parsed/surrounding_mid.p", "wb") as f:
pickle.dump(surrounding_stations_experiments_mid, f)
In [131]:
# choose the columns to use in the model
surrounding_stations_cols = ['Near1TMinus12', 'Near1TMinus18']
redistribution_cols = ['CollNbBikes', 'DistNbBikes']
boolean_cols_mid = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_mid = ['HumidityTMinus12', 'TempTMinus12', 'TimeOfDay', 'NbBikesTMinus12', 'NbBikesTMinus18']
pred_col_mid = 'NbBikes'
feature_cols_mid = numeric_cols_mid + boolean_cols_mid
cols_mid = [pred_col_mid] + feature_cols_mid + surrounding_stations_cols + redistribution_cols
# select the columns chosen columns
readings_mid = readings.loc[stations_to_use][cols_mid]
# remove na
readings_mid.dropna(inplace=True)
In [132]:
gam_formula = gam_formula_mid + "+ s(%s, bs='tp') + s(%s, bs='tp') + %s + %s " % ('Near1TMinus12','Near1TMinus18', 'CollNbBikes', 'DistNbBikes')
lr_formula = lr_formula_mid + "+ %s + %s + %s + %s " % ('Near1TMinus12', 'Near1TMinus18', 'CollNbBikes', 'DistNbBikes')
In [133]:
all_mid = [model(readings_mid, stations_to_use, gam_formula, lr_formula, 'NbBikes')]
In [134]:
with open("data/parsed/all_mid.p", "wb") as f:
pickle.dump(all_mid, f)
In [56]:
with open("data/parsed/stations_sample.p", "rb") as f:
stations_to_use = pickle.load(f)
In [178]:
lr_formula_long = "NbBikes ~ TimeOfDay + Weekday "
In [179]:
gam_formula_long = "NbBikes ~ s(TimeOfDay, by=Weekday, bs='tp') + s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') "
In [180]:
# choose the columns to use in the model
boolean_cols_long = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_long = ['TimeOfDay']
pred_col_long = 'NbBikes'
feature_cols_long = numeric_cols_long + boolean_cols_long
cols_long = [pred_col_long] + feature_cols_long
# select the columns chosen columns
readings_long = readings.loc[stations_to_use][cols_long]
# remove na
readings_long.dropna(inplace=True)
In [181]:
baseline_long = [model(readings_long, stations_to_use, gam_formula_long, lr_formula_long, 'NbBikes')]
In [182]:
with open("data/parsed/baseline_long.p", "wb") as f:
pickle.dump(baseline_long, f)
In [183]:
bicycle_redistribution_experiments = [('CollNbBikes', 'DistNbBikes'), ('CollNbBikesCum2', 'DistNbBikesCum2'), ('CollNbBikesCum6', 'DistNbBikesCum6')]
In [184]:
# choose the columns to use in the model
redistribution_cols = ['CollNbBikes' , 'CollNbBikesCum2', 'CollNbBikesCum6', 'DistNbBikes' , 'DistNbBikesCum2', 'DistNbBikesCum6']
boolean_cols_long = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_long = ['TimeOfDay']
pred_col_long = 'NbBikes'
feature_cols_long = numeric_cols_long + boolean_cols_long + redistribution_cols
cols_long = [pred_col_long] + feature_cols_long
# select the columns chosen columns
readings_long = readings.loc[stations_to_use][cols_long]
# remove na
readings_long.dropna(inplace=True)
In [185]:
redistribution_experiments_long = []
for redistribution_columns in bicycle_redistribution_experiments:
# add redistribution feature to formula
gam_formula = gam_formula_long
lr_formula = lr_formula_long
gam_formula += "+ %s + %s " % (redistribution_columns[0],redistribution_columns[1])
lr_formula += "+ %s + %s " % (redistribution_columns[0],redistribution_columns[1])
print '*******************************************************************'
print 'Experiment with column %s and %s' % (redistribution_columns[0],redistribution_columns[1])
print 'GAM Formula: %s' % gam_formula
print 'LR Formula %s' % lr_formula
results = model(readings_long, stations_to_use, gam_formula, lr_formula, 'NbBikes')
redistribution_experiments_long.append(results)
In [186]:
with open("data/parsed/redistribution_long.p", "wb") as f:
pickle.dump(redistribution_experiments_long, f)
In [187]:
# choose the columns to use in the model
boolean_cols_long = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_long = ['TimeOfDay', 'HistAvg']
pred_col_long = 'NbBikes'
feature_cols_long = numeric_cols_long + boolean_cols_long
cols_long = [pred_col_long] + feature_cols_long
# select the columns chosen columns
readings_long = readings.loc[stations_to_use][cols_long]
# remove na
readings_long.dropna(inplace=True)
In [195]:
gam_formula = gam_formula_long + "+ s(%s, bs='tp', k=5) " % 'HistAvg'
lr_formula = lr_formula_long + "+ %s " % 'HistAvg'
In [196]:
histavg_long = [model(readings_long, stations_to_use, gam_formula, lr_formula, 'NbBikes')]
In [197]:
with open("data/parsed/histavg_long.p", "wb") as f:
pickle.dump(histavg_long, f)
In [199]:
# choose the columns to use in the model
redistribution_cols = ['CollNbBikesCum6', 'DistNbBikesCum6']
boolean_cols_long = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_long = ['TimeOfDay', 'HistAvg']
pred_col_long = 'NbBikes'
feature_cols_long = numeric_cols_long + boolean_cols_long + redistribution_cols
cols_long = [pred_col_long] + feature_cols_long
# select the columns chosen columns
readings_long = readings.loc[stations_to_use][cols_long]
# remove na
readings_long.dropna(inplace=True)
In [200]:
gam_formula = gam_formula_long + "+ s(%s, bs='tp', k=5) + %s + %s " % ('HistAvg', 'CollNbBikesCum6', 'DistNbBikesCum6')
lr_formula = lr_formula_long + "+ %s + %s + %s " % ('HistAvg', 'CollNbBikesCum6', 'DistNbBikesCum6')
In [201]:
all_long = [model(readings_long, stations_to_use, gam_formula, lr_formula, 'NbBikes')]
In [202]:
with open("data/parsed/all_long.p", "wb") as f:
pickle.dump(all_long, f)
In [6]:
with open("data/parsed/baseline_short.p", "rb") as f:
baseline_short = pickle.load(f)
with open("data/parsed/all_short.p", "rb") as f:
all_short = pickle.load(f)
with open("data/parsed/redistribution_short.p", "rb") as f:
redistribution_experiments_short = pickle.load(f)
with open("data/parsed/surrounding_short.p", "rb") as f:
surrounding_stations_experiments_short = pickle.load(f)
In [108]:
baseline_short_df = convert_results_to_df(baseline_short, 'BASE')
baseline_short_df[['BASE-EXP0-GAM-ERR', 'BASE-EXP0-LR-ERR']].mean()
Out[108]:
In [109]:
all_short_df = convert_results_to_df(all_short, 'ALL')
all_short_df[['ALL-EXP0-GAM-ERR', 'ALL-EXP0-LR-ERR']].mean()
Out[109]:
In [110]:
redistribution_short_df = convert_results_to_df(redistribution_experiments_short, 'SH-RED')
redistribution_short_df[['SH-RED-EXP0-GAM-ERR', 'SH-RED-EXP0-LR-ERR',
'SH-RED-EXP1-GAM-ERR', 'SH-RED-EXP1-LR-ERR',
'SH-RED-EXP2-GAM-ERR', 'SH-RED-EXP2-LR-ERR']].mean()
Out[110]:
In [111]:
surrounding_short_df = convert_results_to_df(surrounding_stations_experiments_short, 'SH-SURR')
surrounding_short_df = surrounding_short_df[['SH-SURR-EXP0-GAM-ERR', 'SH-SURR-EXP0-LR-ERR',
'SH-SURR-EXP1-GAM-ERR', 'SH-SURR-EXP1-LR-ERR',
'SH-SURR-EXP2-GAM-ERR', 'SH-SURR-EXP2-LR-ERR',
'SH-SURR-EXP3-GAM-ERR', 'SH-SURR-EXP3-LR-ERR']]
surrounding_short_df.mean()
Out[111]:
In [136]:
short_results = baseline_short_df.merge(all_short_df, how='inner', right_index=True, left_index=True)
short_results = short_results.merge(redistribution_short_df, how='inner', right_index=True, left_index=True)
short_results = short_results.merge(surrounding_short_df, how='inner', right_index=True, left_index=True)
short_results.columns = ['BASE-GAM', 'BASE-LR',
'ALL-GAM', 'ALL-LR',
'RED-GAM-NOW', 'RED-LR-NOW',
'RED-GAM-CUM2', 'RED-LR-CUM2',
'RED-GAM-CUM6', 'RED-LR-CUM6',
'SURR-GAM-NEAR1', 'SURR-LR-NEAR1',
'SURR-GAM-NEAR2', 'SURR-LR-NEAR2',
'SURR-GAM-NEAR5', 'SURR-LR-NEAR5',
'SURR-GAM-NEAR10', 'SURR-LR-NEAR10']
In [137]:
short_results.mean()['ALL-LR'] - short_results.mean()['BASE-GAM']
Out[137]:
In [140]:
ax = sns.barplot(data=short_results, palette=sns.xkcd_palette(["windows blue"]))
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set_ylabel('Average RMSE')
ax.set_xlabel('Experiment')
ax.set_ylim((0.9, 1.25))
ax.set_title('Feature Selection Experiments in Short-Term Predictions')
plt.savefig('reports/images/features-short-results.eps', format='eps', dpi=1000, bbox_inches='tight', pad_inches=0)
In [154]:
temp = short_results.describe().loc[['mean', 'std']].unstack().groupby(level=0).aggregate(lambda x: tuple(x))
temp = temp.apply(lambda x: '%.3f (%.3f)' % (x[0], x[1])).rename('AVG_RMSE').to_frame().sort_values(by=['AVG_RMSE'])
temp
Out[154]:
In [155]:
print temp.to_latex()
In [17]:
with open("data/parsed/baseline_mid.p", "rb") as f:
baseline_mid = pickle.load(f)
with open("data/parsed/all_mid.p", "rb") as f:
all_mid = pickle.load(f)
with open("data/parsed/redistribution_mid.p", "rb") as f:
redistribution_experiments_mid = pickle.load(f)
with open("data/parsed/surrounding_mid.p", "rb") as f:
surrounding_stations_experiments_mid = pickle.load(f)
In [18]:
baseline_mid_df = convert_results_to_df(baseline_mid, 'MID-BASE')
baseline_mid_df[['MID-BASE-EXP0-GAM-ERR', 'MID-BASE-EXP0-LR-ERR']].mean()
Out[18]:
In [19]:
all_mid_df = convert_results_to_df(all_mid, 'MID-ALL')
all_mid_df[['MID-ALL-EXP0-GAM-ERR', 'MID-ALL-EXP0-LR-ERR']].mean()
Out[19]:
In [20]:
redistribution_mid_df = convert_results_to_df(redistribution_experiments_mid, 'MID-RED')
redistribution_mid_df[['MID-RED-EXP0-GAM-ERR', 'MID-RED-EXP0-LR-ERR',
'MID-RED-EXP1-GAM-ERR', 'MID-RED-EXP1-LR-ERR',
'MID-RED-EXP2-GAM-ERR', 'MID-RED-EXP2-LR-ERR']].mean()
Out[20]:
In [21]:
surrounding_mid_df = convert_results_to_df(surrounding_stations_experiments_mid, 'MID-SURR')
surrounding_mid_df[['MID-SURR-EXP0-GAM-ERR', 'MID-SURR-EXP0-LR-ERR',
'MID-SURR-EXP1-GAM-ERR', 'MID-SURR-EXP1-LR-ERR',
'MID-SURR-EXP2-GAM-ERR', 'MID-SURR-EXP2-LR-ERR',
'MID-SURR-EXP3-GAM-ERR', 'MID-SURR-EXP3-LR-ERR']].mean()
Out[21]:
In [22]:
mid_results = baseline_mid_df.merge(all_mid_df, how='inner', right_index=True, left_index=True)
mid_results = mid_results.merge(redistribution_mid_df, how='inner', right_index=True, left_index=True)
mid_results = mid_results.merge(surrounding_mid_df, how='inner', right_index=True, left_index=True)
mid_results.columns = ['BASE-GAM', 'BASE-LR',
'ALL-GAM', 'ALL-LR',
'RED-GAM-NOW', 'RED-LR-NOW',
'RED-GAM-CUM2', 'RED-LR-CUM2',
'RED-GAM-CUM6', 'RED-LR-CUM6',
'SURR-GAM-NEAR1', 'SURR-LR-NEAR1',
'SURR-GAM-NEAR2', 'SURR-LR-NEAR2',
'SURR-GAM-NEAR5', 'SURR-LR-NEAR5',
'SURR-GAM-NEAR10', 'SURR-LR-NEAR10']
In [23]:
ax = sns.barplot(data=mid_results, palette=sns.xkcd_palette(["windows blue"]))
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set_ylabel('Average RMSE')
ax.set_xlabel('Experiment')
ax.set_ylim((2.2, 3.2))
ax.set_title('Feature Selection Experiments in Mid-Term Predictions')
plt.savefig('reports/images/features-mid-results.eps', format='eps', dpi=1000, bbox_inches='tight', pad_inches=0)
In [24]:
mid_results.mean()['ALL-LR'] - mid_results.mean()['BASE-GAM']
Out[24]:
In [25]:
mid_results.describe()
Out[25]:
In [156]:
temp = mid_results.describe().loc[['mean', 'std']].unstack().groupby(level=0).aggregate(lambda x: tuple(x))
temp = temp.apply(lambda x: '%.3f (%.3f)' % (x[0], x[1])).rename('AVG_RMSE').to_frame()
temp.sort_values(by=['AVG_RMSE'])
Out[156]:
In [157]:
print temp.to_latex()
In [235]:
with open("data/parsed/baseline_long.p", "rb") as f:
baseline_long = pickle.load(f)
with open("data/parsed/redistribution_long.p", "rb") as f:
redistribution_experiments_long = pickle.load(f)
with open("data/parsed/histavg_long.p", "rb") as f:
histavg_long = pickle.load(f)
with open("data/parsed/all_long.p", "rb") as f:
all_long = pickle.load(f)
In [236]:
baseline_long_df = convert_results_to_df(baseline_long, 'LONG-BASE')
baseline_long_df[['LONG-BASE-EXP0-GAM-ERR', 'LONG-BASE-EXP0-LR-ERR']].mean()
Out[236]:
In [237]:
all_long_df = convert_results_to_df(all_long, 'LONG-ALL')
all_long_df[['LONG-ALL-EXP0-GAM-ERR', 'LONG-ALL-EXP0-LR-ERR']].mean()
Out[237]:
In [238]:
redistribution_long_df = convert_results_to_df(redistribution_experiments_long, 'LONG-RED')
redistribution_long_df[['LONG-RED-EXP0-GAM-ERR', 'LONG-RED-EXP0-LR-ERR',
'LONG-RED-EXP1-GAM-ERR', 'LONG-RED-EXP1-LR-ERR',
'LONG-RED-EXP2-GAM-ERR', 'LONG-RED-EXP2-LR-ERR']].mean()
Out[238]:
In [239]:
histavg_long_df = convert_results_to_df(histavg_long, 'LONG-AVG')
histavg_long_df[['LONG-AVG-EXP0-GAM-ERR', 'LONG-AVG-EXP0-LR-ERR']].mean()
Out[239]:
In [240]:
long_results = baseline_long_df.merge(all_long_df, how='inner', right_index=True, left_index=True)
long_results = long_results.merge(redistribution_long_df, how='inner', right_index=True, left_index=True)
long_results = long_results.merge(histavg_long_df, how='inner', right_index=True, left_index=True)
long_results.columns = ['BASE-GAM', 'BASE-LR',
'ALL-GAM', 'ALL-LR',
'RED-GAM-NOW', 'RED-LR-NOW',
'RED-GAM-CUM2', 'RED-LR-CUM2',
'RED-GAM-CUM6', 'RED-LR-CUM6',
'HAVG-GAM', 'HAVG-LR']
In [245]:
long_results.describe()
Out[245]:
In [242]:
long_results.mean()['BASE-LR'] - long_results.mean()['BASE-GAM']
Out[242]:
In [244]:
ax = sns.barplot(data=long_results, palette=sns.xkcd_palette(["windows blue"]))
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set_ylabel('Average RMSE')
ax.set_xlabel('Experiment')
ax.set_ylim((5.0, 7.8))
ax.set_title('Feature Selection Experiments in Long-Term Predictions')
plt.savefig('reports/images/features-long-results.eps', format='eps', dpi=1000, bbox_inches='tight', pad_inches=0)
In [246]:
temp = long_results.describe().loc[['mean', 'std']].unstack().groupby(level=0).aggregate(lambda x: tuple(x))
temp = temp.apply(lambda x: '%.3f (%.3f)' % (x[0], x[1])).rename('AVG_RMSE').to_frame()
temp.sort_values(by=['AVG_RMSE'])
Out[246]:
In [247]:
print temp.to_latex()
In [280]:
mid_results['ALL-GAM'].sort_values()[49:50]
Out[280]:
In [281]:
training, validation, test = split_datasets(readings_long, 'BikePoints_240')
In [282]:
%load_ext rpy2.ipython
robjects.globalenv['data'] = training
In [284]:
%%R
library(mgcv)
gamModel <- gam(NbBikes ~ s(TempTMinus12, HumidityTMinus12, bs='tp') + s(TimeOfDay, by=Weekday, bs='tp')
+ s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') + s(NbBikesTMinus12, bs='tp')
+ s(NbBikesTMinus18, bs='tp'), data=data)
summary(gamModel)
In [ ]:
In [ ]: