In [1]:
%run src/data/visualization.py
In [4]:
%matplotlib inline
import math
import pytz
import time
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
In [5]:
import pickle
statistics = pickle.load(open('data/parsed/stations_statistics.p', 'rb'))
stations = pickle.load(open('data/parsed/stations_dataset_final.p', 'rb'))
readings = pickle.load(open("data/parsed/readings_full.p", "rb"))
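In [ ]:
# Quick sanity check on the loaded frames; the groupby and .loc calls below
# assume readings carries an (Id, Timestamp) MultiIndex.
print readings.index.names
print readings.shape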
In [6]:
columns = ['CollNbBikes', 'DewPt', 'DistNbBikes', 'Humidity', 'NbBikes', 'Rain', 'Temp', 'Visibility',
'WindSpeed', 'TimeOfYear', 'TimeOfDay']
#columns = ['NbBikes', 'Rain', 'Temp']
data = readings[columns]
In [ ]:
sns.set(style="ticks", color_codes=True)
g = sns.pairplot(data)
In [ ]:
corr = data.corr()
In [ ]:
import matplotlib.pyplot as plt
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
            square=True, xticklabels=2, yticklabels=2,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
In [7]:
# NAB: normalised availability of bikes, i.e. the share of docks holding a bike
readings['NAB'] = readings.NbBikes / (readings.NbBikes + readings.NbEmptyDocks)
# NAS: normalised activity score, i.e. the absolute change in bikes since the
# previous reading, relative to the station's capacity
readings['NAS'] = (readings.NbBikesTMinus1 - readings.NbBikes).apply(math.fabs) / (readings.NbBikes + readings.NbEmptyDocks)
In [8]:
# Mean NAB/NAS per (weekday flag, station, time of day), excluding holidays
dayviews = (readings.query('Holiday == 0')[['NAB', 'NAS', 'TimeOfDay', 'Weekday']]
            .dropna().reset_index()
            .groupby(['Weekday', 'Id', 'TimeOfDay']).mean())
In [9]:
%run src/data/visualization.py
%run src/data/helper.py
In [10]:
from sklearn import preprocessing
# Min-max scale each station's mean NAS, separately for weekends (Weekday == 0)
# and weekdays (Weekday == 1), then attach the station metadata
NAS_weekend = dayviews.loc[0].groupby(level='Id').mean().NAS
NAS_weekend = pd.Series(preprocessing.MinMaxScaler().fit_transform(NAS_weekend.values.reshape(-1, 1)).reshape(len(NAS_weekend)), index=NAS_weekend.index)
NAS_weekend = add_station_info(pd.DataFrame(NAS_weekend, columns=['NAS']), stations.set_index('Id'), use_indexes=True)
NAS_weekday = dayviews.loc[1].groupby(level='Id').mean().NAS
NAS_weekday = pd.Series(preprocessing.MinMaxScaler().fit_transform(NAS_weekday.values.reshape(-1, 1)).reshape(len(NAS_weekday)), index=NAS_weekday.index)
NAS_weekday = add_station_info(pd.DataFrame(NAS_weekday, columns=['NAS']), stations.set_index('Id'), use_indexes=True)
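In [ ]:
# add_station_info comes from src/data/helper.py, which is not part of this
# notebook. A hypothetical sketch of what it is assumed to do (the real helper
# may differ): join per-station values with station metadata such as Name,
# Latitude and Longitude, which the map markers below rely on.
def add_station_info_sketch(df, stations_indexed, use_indexes=False):
    if use_indexes:
        return df.join(stations_indexed, how='left')
    return df.merge(stations_indexed.reset_index(), on='Id', how='left')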
In [34]:
import folium
import matplotlib.pyplot as plt

def nas_marker(station):
    # colour each station by its scaled NAS (autumn_r maps high values to red)
    color = cmap_to_hex(plt.get_cmap('autumn_r'), station.NAS)
    label = "%s - %s" % (station.name, station['Name'])
    return folium.CircleMarker(location=[station['Latitude'], station['Longitude']], radius=75,
                               popup=label, fill_color=color, color=color, fill_opacity=0.9)
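In [ ]:
# cmap_to_hex and draw_stations_map come from src/data/visualization.py (not
# shown). Hypothetical sketches of both, for readability only; the real
# implementations may differ.
from matplotlib.colors import rgb2hex

def cmap_to_hex_sketch(cmap, value):
    # map a value in [0, 1] through a matplotlib colormap to a hex colour string
    return rgb2hex(cmap(float(value))[:3])

def draw_stations_map_sketch(stations_df, marker_fn):
    # one marker per station row, on a map centred on London
    m = folium.Map(location=[51.5074, -0.1278], zoom_start=12)
    for _, station in stations_df.iterrows():
        marker_fn(station).add_to(m)
    return m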
In [35]:
NAS_weekday_map = draw_stations_map(NAS_weekday, nas_marker)
folium.Map.save(NAS_weekday_map, 'reports/maps/weekday_activity.html')
NAS_weekday_map
Out[35]:
In [36]:
NAS_weekend_map = draw_stations_map(NAS_weekend, nas_marker)
folium.Map.save(NAS_weekend_map, 'reports/maps/weekends_activity.html')
NAS_weekend_map
Out[36]:
In [72]:
def get_nab(weekday, hour):
    # TimeOfDay is stored in seconds; look up the slot for the given hour
    nab = dayviews2.loc[weekday].xs((hour - 1) * 3600, level='TimeOfDay')[['NAB']]
    nab.NAB = preprocessing.MinMaxScaler().fit_transform(nab.values.reshape(-1, 1))
    return add_station_info(nab, stations.set_index('Id'), use_indexes=True)

def nab_marker(station):
    color = cmap_to_hex(plt.get_cmap('autumn_r'), station.NAB)
    label = "%s - %s" % (station.name, station['Name'])
    return folium.CircleMarker(location=[station['Latitude'], station['Longitude']], radius=75,
                               popup=label, fill_color=color, color=color, fill_opacity=0.9)
In [69]:
# Cast TimeOfDay to int so the xs() lookup in get_nab matches exactly
dayviews2 = dayviews.reset_index()
dayviews2.TimeOfDay = dayviews2.TimeOfDay.astype(int)
dayviews2 = dayviews2.set_index(['Weekday', 'Id', 'TimeOfDay'])
In [73]:
NAB_weekday_7_map = draw_stations_map(get_nab(1,7), nab_marker)
folium.Map.save(NAB_weekday_7_map, 'reports/maps/weekday_availability_7.html')
NAB_weekday_7_map
Out[73]:
In [74]:
NAB_weekday_10_map = draw_stations_map(get_nab(1,10), nab_marker)
folium.Map.save(NAB_weekday_10_map, 'reports/maps/weekday_availability_10.html')
NAB_weekday_10_map
Out[74]:
In [75]:
NAB_weekday_16_map = draw_stations_map(get_nab(1,16), nab_marker)
folium.Map.save(NAB_weekday_16_map, 'reports/maps/weekday_availability_16.html')
NAB_weekday_16_map
Out[75]:
In [76]:
NAB_weekday_19_map = draw_stations_map(get_nab(1,19), nab_marker)
folium.Map.save(NAB_weekday_19_map, 'reports/maps/weekday_availability_19.html')
NAB_weekday_19_map
Out[76]:
In [ ]:
NAB_weekend_11_map = draw_stations_map(get_nab(0,11), nab_marker)
folium.Map.save(NAB_weekend_11_map, 'reports/maps/weekend_availability_11.html')
NAB_weekend_11_map
In [ ]:
NAB_weekend_13_map = draw_stations_map(get_nab(0,13), nab_marker)
folium.Map.save(NAB_weekend_13_map, 'reports/maps/weekend_availability_13.html')
NAB_weekend_13_map
In [ ]:
NAB_weekend_15_map = draw_stations_map(get_nab(0,15), nab_marker)
folium.Map.save(NAB_weekend_15_map, 'reports/maps/weekend_availability_15.html')
NAB_weekend_15_map
In [ ]:
NAB_weekend_19_map = draw_stations_map(get_nab(0,19), nab_marker)
folium.Map.save(NAB_weekend_19_map, 'reports/maps/weekend_availability_19.html')
NAB_weekend_19_map
In [ ]:
from matplotlib.ticker import FuncFormatter
timeofday_formatter = FuncFormatter(lambda x, pos: x / 3600)
station_id = 'BikePoints_374'
In [ ]:
ax = pd.concat([dayviews.loc[0, station_id].NAB.rename('Weekends'),
dayviews.loc[1, station_id].NAB.rename('Weekdays')],
axis=1).plot(xticks = range(7200, 25 * 3600, 14400), xlim=(21600, 86400))
ax.xaxis.set_major_formatter(timeofday_formatter)
In [ ]:
ax = pd.concat([dayviews.loc[0, station_id].NAS.rename('Weekends'),
dayviews.loc[1, station_id].NAS.rename('Weekdays')],
axis=1).plot(xticks = range(7200, 25 * 3600, 14400), xlim=(21600, 86400))
ax.xaxis.set_major_formatter(timeofday_formatter)
In [ ]:
ax = sns.tsplot(time='TimeOfDay', value='NAB', unit='Id', condition='Weekday', data=dayviews.reset_index())
# set ticks
ax.xaxis.set_major_formatter(timeofday_formatter)
ax.set_xticks(range(7200, 25 * 3600, 14400))
# set legends
legends = ax.legend().get_texts()
legends[0].set_text('Weekends')
legends[1].set_text('Weekdays')
In [ ]:
ax = sns.tsplot(time='TimeOfDay', value='NAS', unit='Id', condition='Weekday', data=dayviews.reset_index())
# set ticks
ax.xaxis.set_major_formatter(timeofday_formatter)
ax.set_xticks(range(7200, 25 * 3600, 14400))
# set legends
legends = ax.legend().get_texts()
legends[0].set_text('Weekends')
legends[1].set_text('Weekdays')
In [ ]:
priority1_ids = stations.query('Priority == "1"').Id
priority2_ids = stations.query('Priority == "2"').Id
# Priority != Priority is only true for NaN, i.e. stations without a priority
priority3_ids = stations.query('(Priority != Priority)').Id
In [ ]:
dayviews1 = dayviews.loc[1].sort_index().loc[priority1_ids.values.tolist()]
dayviews2 = dayviews.reset_index(level=0).sort_index().loc[priority2_ids.values.tolist()]
dayviews3 = dayviews.reset_index(level=0).sort_index().loc[priority3_ids.values.tolist()]
In [ ]:
ax = sns.tsplot(time='TimeOfDay', value='NAS', unit='Id', condition='Weekday', data=dayviews3.reset_index())
# set ticks
ax.xaxis.set_major_formatter(timeofday_formatter)
ax.set_xticks(range(7200, 25 * 3600, 14400))
# set legends
legends = ax.legend().get_texts()
legends[0].set_text('Weekends')
legends[1].set_text('Weekdays')
In [6]:
from cdtw import cdtw_sakoe_chiba

def dtw_sakoe_chiba(a, b):
    # DTW distance constrained to a Sakoe-Chiba band of width 12 steps
    return cdtw_sakoe_chiba(a, b, 12)
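In [ ]:
# Sanity check of the wrapper on a toy series, using the same
# cdtw_sakoe_chiba(a, b, window) call as above: the DTW distance between a
# series and itself should be ~0.
toy = np.sin(np.linspace(0, 2 * np.pi, 48))
print dtw_sakoe_chiba(toy, toy)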
In [7]:
# One weekday NAS profile per station (stations as rows, TimeOfDay as columns)
data = dayviews.unstack().loc[1].NAS
In [8]:
from scipy.spatial.distance import pdist, squareform
# Pairwise DTW distances between station profiles, expanded to a square matrix
dist_condensed = pdist(data.values, dtw_sakoe_chiba)
dist_matrix = squareform(dist_condensed)
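In [ ]:
# pdist evaluates dtw_sakoe_chiba once per station pair, so the cell above is
# O(n^2) in the number of stations. Sanity check: squareform should yield a
# symmetric matrix with a zero diagonal.
assert np.allclose(dist_matrix, dist_matrix.T)
assert np.allclose(np.diag(dist_matrix), 0)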
In [9]:
from sklearn.cluster import AgglomerativeClustering
# Complete-linkage agglomerative clustering over the precomputed DTW distances
agg_clustering = AgglomerativeClustering(n_clusters=5, affinity='precomputed', linkage="complete")
clusters = pd.Series(agg_clustering.fit_predict(dist_matrix), index=data.index)
In [10]:
clusters.value_counts()
Out[10]:
In [13]:
data = add_station_info(clusters.to_frame('Cluster'), stations.set_index('Id'), use_indexes=True)
data.query('Priority == "1"').Cluster.value_counts()
Out[13]:
In [20]:
clusters_df = dayviews.unstack().loc[1].NAS.copy()
clusters_df['Cluster'] = clusters
clusters_df = clusters_df[['Cluster']]
clusters_df.Cluster.nunique()
Out[20]:
In [ ]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
for col in statistics.columns.difference(['Latitude', 'Longitude', 'Priority']):
    std_col = '%sS' % col
    statistics[std_col] = min_max_scaler.fit_transform(statistics[col].values.reshape(-1, 1))
In [ ]:
from sklearn.cluster import KMeans
statistics = statistics.sort_values(by=['Priority'])
# (n_clusters, priority) pairs: cluster each priority group separately
priority_clusters = [(3, 1), (2, 2), (2, 3)]
cluster_cols = ['EmptyEveningPeakS', 'EmptyMorningPeakS', 'EmptyNonPeakS',
                'FullEveningPeakS', 'FullMorningPeakS', 'FullNonPeakS',
                'CountS']
clusters = []
offset = 0
for n_clusters, priority in priority_clusters:
    window = statistics[statistics.Priority == priority][cluster_cols]
    p_clusters = KMeans(n_clusters=n_clusters).fit_predict(window.values)
    # the offset keeps cluster labels unique across priority groups
    clusters.extend(p_clusters + offset)
    offset += n_clusters
statistics['Cluster'] = clusters
In [ ]:
statistics.Cluster.hist(bins=7)
In [ ]:
Dark2_7.show_discrete_image()
In [ ]:
draw_stations_map(statistics, create_cluster_marker('Cluster'))
In [ ]:
# Training: 15 May-12 Jun; validation: 13-19 Jun; test: 20-26 Jun 2016
split_training = lambda df: df[datetime(2016,5,15,0,0,0,0):datetime(2016,6,12,23,59,59,999999)]
split_validation = lambda df: df[datetime(2016,6,13,0,0,0,0):datetime(2016,6,19,23,59,59,999999)]
split_test = lambda df: df[datetime(2016,6,20,0,0,0,0):datetime(2016,6,26,23,59,59,999999)]
In [ ]:
def split_datasets(df, station_id):
    station_df = df.loc[station_id]
    training = split_training(station_df)
    validation = split_validation(station_df)
    test = split_test(station_df)
    return training, validation, test
In [ ]:
import sys

def clip_and_round(arr):
    # predictions must be non-negative whole numbers of bikes
    arr = np.clip(arr, 0, sys.maxint)
    return np.round(arr)
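In [ ]:
# clip_and_round in action: negative predictions are floored at zero and
# everything is rounded to a whole number of bikes.
clip_and_round(np.array([-1.2, 3.6, 10.49]))  # array([ 0., 4., 10.])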
In [ ]:
import inspect
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_squared_error
from rpy2 import robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
from rpy2.robjects import IntVector, Formula
pandas2ri.activate()
r = robjects.r
base = importr('base')
stats = importr('stats')
mgcv = importr('mgcv')

class GAMRegressor(BaseEstimator, RegressorMixin):
    """A scikit-learn style regressor wrapping mgcv's gam() via rpy2."""
    def __init__(self, features=None, formula_str=None,
                 Fog_parametric=None, Rain_parametric=None, TempAndHumidity_k=None,
                 TMinus1_k=None, TMinus2_k=None,
                 TMinus6_k=None, TMinus12_k=None,
                 TimeOfYear_k=None,
                 TimeOfDay_1_k=None, TimeOfDay_2_k=None, TimeOfDay_3_k=None,
                 TimeOfDay_1_bs=None, TimeOfDay_2_bs=None, TimeOfDay_3_bs=None,
                 TimeOfDay_1_by=None, TimeOfDay_2_by=None, TimeOfDay_3_by=None):
        args, _, _, values = inspect.getargvalues(inspect.currentframe())
        values.pop("self")
        self.args_to_set = []
        for arg, val in values.items():
            # record the smooth-term arguments (everything except features
            # and formula_str)
            if arg != 'features' and arg != 'formula_str':
                self.args_to_set.append(arg)
            setattr(self, arg, val)
        self.gam = None

    def fit(self, X, y=None):
        if self.formula_str is None:
            features_dicts = self.build_features_dicts()
            self.formula_str = self.build_formula_str(features_dicts)
        #print self.formula_str
        self.gam = mgcv.gam(Formula(self.formula_str), data=X)
        return self

    def predict(self, X):
        assert (self.gam is not None), "GAM must be fitted before predicting"
        p_val = clip_and_round(stats.predict(self.gam, newdata=X))
        return p_val

    def score(self, X):
        # negative RMSE, so that higher is better for GridSearchCV
        p_val = self.predict(X)
        y_val = X.NbBikes
        rmse = mean_squared_error(y_val, p_val)**0.5
        return rmse * (-1)

    def build_features_dicts(self):
        assert (self.features is not None), "features must be set"
        # initialize one dictionary of smooth-term defaults per feature
        features_dicts = {}
        for feature in self.features:
            features_dicts[feature] = {
                'name': feature,
                'bs': 'tp',
                'sp': None,
                'by': None,
                'k': None,
                'parametric': False
            }
        # overwrite the defaults with any constructor arguments that were set
        for arg in self.args_to_set:
            val = getattr(self, arg)
            if val is None:
                continue
            feature, parameter = arg.rsplit('_', 1)
            features_dicts[feature][parameter] = val
        return features_dicts

    def build_formula_str(self, features_dicts):
        formula = 'NbBikes ~ '
        for feature, feature_dict in features_dicts.iteritems():
            if feature_dict['parametric']:
                formula += '%(name)s+' % feature_dict
                continue
            tokens = feature_dict['name'].split('_')
            name, index = (tokens[0], None) if len(tokens) == 1 else (tokens[0], tokens[1])
            # 'TempAndHumidity' becomes the bivariate smooth s(Temp,Humidity)
            formula += "s(%s" % name.replace('And', ',')
            if feature_dict['bs'] is not None:
                formula += ", bs='%s'" % feature_dict['bs']
            if feature_dict['sp'] is not None:
                formula += ", sp=%0.3f" % feature_dict['sp']
            if feature_dict['by'] is not None:
                formula += ", by=%s" % feature_dict['by']
            if feature_dict['k'] is not None:
                formula += ", k=%s" % feature_dict['k']
            formula += ")+"
        return formula[:-1]
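In [ ]:
# Minimal usage sketch of GAMRegressor: build the mgcv formula from feature
# names and per-term parameters without fitting anything in R. The feature
# names below are illustrative and mirror the grid search further down.
gam = GAMRegressor(features=['TempAndHumidity', 'TimeOfDay_1', 'Rain'],
                   TimeOfDay_1_by='Weekday', TimeOfDay_1_bs='cc',
                   TimeOfDay_1_k=8, Rain_parametric=True)
print gam.build_formula_str(gam.build_features_dicts())
# e.g. NbBikes ~ s(Temp,Humidity, bs='tp')+s(TimeOfDay, bs='cc', by=Weekday, k=8)+Rain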
In [ ]:
from sklearn.linear_model import LinearRegression

def fit_and_predict_lm(X_train, y_train, X_val):
    lm = LinearRegression()
    lm.fit(X_train, y_train)
    return lm, clip_and_round(lm.predict(X_val))
In [ ]:
def fit_and_predict_gam(training, validation, formula):
    gam = GAMRegressor(formula_str=formula)
    gam.fit(training)
    return gam, clip_and_round(gam.predict(validation))
In [ ]:
def model(df, station_ids, gam_formula, feature_cols, pred_col):
    results = []
    for station_id in station_ids:
        print 'Fitting %s' % station_id
        training, validation, test = split_datasets(df, station_id)
        X_train, y_train = training[feature_cols], training[pred_col]
        X_val, y_val = validation[feature_cols], validation[pred_col]
        try:
            # Linear model baseline
            lm_fit = fit_and_predict_lm(X_train, y_train, X_val)
            lm_rmse = mean_squared_error(y_val, lm_fit[1])**0.5
            # GAM model
            gam_fit = fit_and_predict_gam(training, validation, gam_formula)
            gam_rmse = mean_squared_error(y_val, gam_fit[1])**0.5
        except:
            # skip the station rather than appending stale results
            print 'Error in station %s' % station_id
            continue
        results.append({'Id': station_id, 'LM': lm_rmse, 'GAM': gam_rmse, 'LM_p': lm_fit[1], 'GAM_p': gam_fit[1]})
    return results
In [ ]:
# choose the columns to use in the model
boolean_cols_short = ['Weekday', 'Weekend', 'Holiday', 'RainTMinus1', 'FogTMinus1']
numeric_cols_short = ['HumidityTMinus1', 'TempTMinus1', 'TimeOfDay', 'NbBikesTMinus1', 'NbBikesTMinus2']
pred_col_short = 'NbBikes'
feature_cols_short = numeric_cols_short + boolean_cols_short
cols_short = [pred_col_short] + feature_cols_short
# select the chosen columns
readings_short = readings[cols_short].copy()
# remove na
readings_short.dropna(inplace=True)
In [ ]:
gam_formula_short = "NbBikes ~ s(TempTMinus1, HumidityTMinus1, bs='tp') + s(TimeOfDay, by=Weekday, bs='tp') "
gam_formula_short += "+ s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') + s(NbBikesTMinus1, bs='tp') "
gam_formula_short += "+ s(NbBikesTMinus2, bs='tp') + RainTMinus1 + FogTMinus1 "
station_ids = readings_short.index.get_level_values('Id').unique()
results_short = model(readings_short, station_ids, gam_formula_short, feature_cols_short, pred_col_short)
In [ ]:
%run src/data/visualization.py
In [ ]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
df = add_station_info(pd.DataFrame(results_short)[['Id', 'GAM', 'LM']], stations)
df['Priority'] = df.Priority.fillna(0).astype('int16')
df['GAMS'] = min_max_scaler.fit_transform(df['GAM'].values.reshape(-1, 1))
df['LMS'] = min_max_scaler.fit_transform(df['LM'].values.reshape(-1, 1))
In [ ]:
draw_stations_map(df, create_result_marker('GAMS'))
In [ ]:
# choose the columns to use in the model
boolean_cols_mid = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_mid = ['HumidityTMinus12', 'TempTMinus12', 'TimeOfDay', 'NbBikesTMinus12', 'NbBikesTMinus18']
pred_col_mid = 'NbBikes'
feature_cols_mid = numeric_cols_mid + boolean_cols_mid
cols_mid = [pred_col_mid] + feature_cols_mid
# select the chosen columns
readings_mid = readings[cols_mid].copy()
# remove na
readings_mid.dropna(inplace=True)
In [ ]:
gam_formula_mid = "NbBikes ~ s(TempTMinus12, HumidityTMinus12, bs='tp') + s(TimeOfDay, by=Weekday, bs='tp') "
gam_formula_mid += "+ s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') + s(NbBikesTMinus12, bs='tp') "
gam_formula_mid += "+ s(NbBikesTMinus18, bs='tp') "
station_ids = readings_mid.index.get_level_values('Id').unique()
results_mid = model(readings_mid, station_ids, gam_formula_mid, feature_cols_mid, pred_col_mid)
In [ ]:
pd.DataFrame(results_mid)[['Id', 'GAM', 'LM']].mean()
In [ ]:
# choose the columns to use in the model
boolean_cols_long = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_long = ['TimeOfDay']
pred_col_long = 'NbBikes'
feature_cols_long = numeric_cols_long + boolean_cols_long
cols_long = [pred_col_long] + feature_cols_long
# select the chosen columns
readings_long = readings[cols_long].copy()
# remove na
readings_long.dropna(inplace=True)
In [ ]:
gam_formula_long = "NbBikes ~ s(TimeOfDay, by=Weekday, bs='tp') "
gam_formula_long += "+ s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') "
station_ids = readings_long.index.get_level_values('Id').unique()
results_long = model(readings_long, station_ids, gam_formula_long, feature_cols_long, pred_col_long)
In [ ]:
pd.DataFrame(results_long)[['Id', 'GAM', 'LM']].mean()
In [ ]:
from sklearn.grid_search import GridSearchCV
training, validation, test = split_datasets(readings_mid, 'BikePoints_101')
search_dataset = pd.concat([training, validation])
features = ['TempAndHumidity', 'TimeOfDay_1', 'TimeOfDay_2', 'TimeOfDay_3', 'TMinus6', 'TMinus12', 'Rain', 'Fog']
parameters = [{'TimeOfDay_1_by': ['Weekday'], 'TimeOfDay_2_by': ['Weekend'], 'TimeOfDay_3_by': ['Holiday'],
               'Fog_parametric': [True], 'Rain_parametric': [True],
               'TMinus6_k': [3, 4, 5], 'TMinus12_k': [6, 7, 8], 'TempAndHumidity_k': [15, 16, 17],
               'TimeOfDay_1_k': [7, 8, 9], 'TimeOfDay_2_k': [6, 7, 8], 'TimeOfDay_3_k': [7, 8, 9],
               'TimeOfDay_1_bs': ['cc', 're', 'tp'], 'TimeOfDay_2_bs': ['cc', 're', 'tp'], 'TimeOfDay_3_bs': ['cc', 're', 'tp'],
               #'TMinus6_sp': np.arange(0,1,0.2), 'TMinus12_sp': np.arange(0,1,0.2),
               #'TimeOfDay_1_sp': np.arange(0,1,0.2), 'TimeOfDay_2_sp': np.arange(0,1,0.2), 'TimeOfDay_3_sp': np.arange(0,1,0.2),
               'features': [features]}]
clf = GridSearchCV(GAMRegressor(), parameters, cv=4)
clf.fit(training)
print 'Best parameters set found on dev set:'
print clf.best_params_
print 'Grid scores on development set:'
for params, mean_score, scores in clf.grid_scores_:
    print '%0.3f (+/-%0.03f) for %r' % (mean_score, scores.std() * 2, params)
print 'The model is trained on the full development set.'
print 'The scores are computed on the full evaluation set.'
p_test = clf.predict(validation)
t_test = validation.NbBikes
rmse = mean_squared_error(t_test, p_test)**0.5
rmse
In [ ]:
%load_ext rpy2.ipython
In [ ]:
# choose the columns to use in the model
pred_col_mid = 'NbBikes'
boolean_cols_mid = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_mid = ['HumidityTMinus12', 'TempTMinus12', 'TimeOfDay', 'NbBikesTMinus12', 'NbBikesTMinus18',
'DistNbBikes', 'CollNbBikes']
feature_cols_mid = numeric_cols_mid + boolean_cols_mid
cols_mid = [pred_col_mid] + feature_cols_mid
# select the chosen columns
readings_mid = readings[cols_mid].copy()
In [ ]:
# A missing redistribution value means no event was recorded, so fill with 0
readings_mid.DistNbBikes = readings_mid.DistNbBikes.fillna(0)
readings_mid.CollNbBikes = readings_mid.CollNbBikes.fillna(0)
# remove na
readings_mid.dropna(inplace=True)
In [ ]:
training, validation, test = split_datasets(readings_mid, 'BikePoints_374')
robjects.globalenv['training'] = training
robjects.globalenv['validation'] = validation
In [ ]:
%%R
library(mgcv)
gamModel <- mgcv::gam(NbBikes ~
s(TempTMinus12, HumidityTMinus12, bs='tp')
+ s(TimeOfDay, by=Weekday, bs='cc')
+ s(TimeOfDay, by=Weekend, bs='cc')
+ s(TimeOfDay, by=Holiday, bs='cc')
+ s(NbBikesTMinus12, bs='tp')
+ s(NbBikesTMinus18, bs='tp')
+ DistNbBikes + CollNbBikes
,data=training)
print(summary(gamModel))
p_val <- predict(gamModel, newdata=validation)
error <- validation$NbBikes - p_val
rmse <- function(error)
{
sqrt(mean(error^2))
}
print(rmse(error))
layout(matrix(1:3, ncol = 3))
plot(gamModel, scale = 0)
layout(1)
#layout(matrix(1:3, ncol = 3))
#acf(resid(gamModel), lag.max = 36, main = "ACF")
#pacf(resid(gamModel), lag.max = 36, main = "pACF")
#layout(1)
#gamModel$lme
In [ ]:
start_time = time.time()
distributed = pickle.load(open('data/parsed/distributed_dataset_final.p', 'rb'))
distributed.sort_values(by=['Id', 'Timestamp'], inplace=True)
distributed = distributed.query('NbBikes != 0')
distributed.set_index(['Id', 'Timestamp'], inplace=True)
distributed.drop(['ShortName', 'Name'], axis=1, inplace=True)
collected = pickle.load(open('data/parsed/collected_dataset_final.p', 'rb'))
collected.sort_values(by=['Id', 'Timestamp'], inplace=True)
collected = collected.query('NbBikes != 0')
collected.set_index(['Id', 'Timestamp'], inplace=True)
collected.drop(['ShortName', 'Name'], axis=1, inplace=True)
end_time = time.time()
print 'Opening redistribution data took %s seconds' % (end_time - start_time)
In [ ]:
station_ids = readings.index.get_level_values('Id').unique().tolist()
station_ids = station_ids[0:100]
# restrict to a single station while experimenting
station_ids = ['BikePoints_374']
In [ ]:
# choose the columns to use in the model
boolean_cols_mid = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_mid = ['HumidityTMinus12', 'TempTMinus12', 'TimeOfDay', 'NbBikesTMinus12', 'NbBikesTMinus18']
pred_col_mid = 'NbBikes'
feature_cols_mid = numeric_cols_mid + boolean_cols_mid
cols_mid = [pred_col_mid] + feature_cols_mid
# select the chosen columns
readings_mid = readings[cols_mid].copy()
# remove na
readings_mid.dropna(inplace=True)
readings_mid = readings_mid.loc[station_ids]
In [ ]:
dist_sum = np.zeros(len(readings_mid))
dist_events = np.zeros(len(readings_mid))
coll_sum = np.zeros(len(readings_mid))
coll_events = np.zeros(len(readings_mid))
period = 1  # look back a window of 5 * period minutes before each reading
i = 0
for idx, row in readings_mid.iterrows():
    station_id, end = idx
    start = end - pd.Timedelta(minutes=5 * period)
    if station_id in distributed.index:
        distributed_window = distributed.loc[station_id].loc[start:end]
        dist_sum[i] = distributed_window.sum()
        dist_events[i] = distributed_window.count()
    if station_id in collected.index:
        collected_window = collected.loc[station_id].loc[start:end]
        coll_sum[i] = collected_window.sum()
        coll_events[i] = collected_window.count()
    i += 1
readings_mid['DistSumTMinus6'] = dist_sum
readings_mid['DistEvtTMinus6'] = dist_events
readings_mid['CollSumTMinus6'] = coll_sum
readings_mid['CollEvtTMinus6'] = coll_events
In [ ]:
readings_mid.to_csv('test.csv')
In [ ]:
gam_formula_mid = "NbBikes ~ s(TempTMinus12, HumidityTMinus12, bs='tp') + s(TimeOfDay, by=Weekday, bs='tp') "
gam_formula_mid += "+ s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') + s(NbBikesTMinus12, bs='tp') "
gam_formula_mid += "+ s(NbBikesTMinus18, bs='tp')"
station_ids = readings_mid.index.get_level_values('Id').unique()
results_mid = model(readings_mid, station_ids, gam_formula_mid, feature_cols_mid, pred_col_mid)
pd.DataFrame(results_mid)[['Id', 'GAM', 'LM']].mean()
In [ ]:
gam_formula_mid = "NbBikes ~ s(TempTMinus12, HumidityTMinus12, bs='tp') + s(TimeOfDay, by=Weekday, bs='tp') "
gam_formula_mid += "+ s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') + s(NbBikesTMinus12, bs='tp') "
gam_formula_mid += "+ s(NbBikesTMinus18, bs='tp') + DistSumTMinus6 + CollSumTMinus6 "
station_ids = readings_mid.index.get_level_values('Id').unique()
results_mid = model(readings_mid, station_ids, gam_formula_mid, feature_cols_mid, pred_col_mid)
pd.DataFrame(results_mid)[['Id', 'GAM', 'LM']].mean()
In [ ]:
gam_formula_mid = "NbBikes ~ s(TempTMinus12, HumidityTMinus12, bs='tp') + s(TimeOfDay, by=Weekday, bs='tp') "
gam_formula_mid += "+ s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') + s(NbBikesTMinus12, bs='tp') "
gam_formula_mid += "+ s(NbBikesTMinus18, bs='tp') + DistEvtTMinus6 + CollEvtTMinus6"
station_ids = readings_mid.index.get_level_values('Id').unique()
results_mid = model(readings_mid, station_ids, gam_formula_mid, feature_cols_mid, pred_col_mid)
pd.DataFrame(results_mid)[['Id', 'GAM', 'LM']].mean()
In [ ]:
gam_formula_mid = "NbBikes ~ s(TempTMinus12, HumidityTMinus12, bs='tp') + s(TimeOfDay, by=Weekday, bs='tp') "
gam_formula_mid += "+ s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') + s(NbBikesTMinus12, bs='tp') "
gam_formula_mid += "+ s(NbBikesTMinus18, bs='tp') + DistSumTMinus6 + CollSumTMinus6 + DistEvtTMinus6 + CollEvtTMinus6"
station_ids = readings_mid.index.get_level_values('Id').unique()
results_mid = model(readings_mid, station_ids, gam_formula_mid, feature_cols_mid, pred_col_mid)
pd.DataFrame(results_mid)[['Id', 'GAM', 'LM']].mean()