In [1]:
    
%matplotlib inline
import math
import pytz
import logging
import traceback
import time
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import cPickle as pickle
    
In [2]:
    
%run src/data/helper.py
    
In [3]:
    
start_time = time.time()
with open('data/parsed/stations_dataset_final.p', 'rb') as f:
    stations = pickle.load(f)
with open('data/parsed/readings_clean.p', "rb") as f:
    readings = pickle.load(f)
end_time = time.time()
print 'Loading the stations and readings datasets took %.2f seconds' % (end_time - start_time)
    
    
In [4]:
    
# four weeks of training, one week of validation, one week of test
split_training = lambda df: df[datetime(2016,5,15,0,0,0,0):datetime(2016,6,12,23,59,59,999999)]
split_validation = lambda df: df[datetime(2016,6,13,0,0,0,0):datetime(2016,6,19,23,59,59,999999)]
split_test = lambda df: df[datetime(2016,6,20,0,0,0,0):datetime(2016,6,26,23,59,59,999999)]
    
In [5]:
    
def split_datasets(df, station_id):
    station_df = df.loc[station_id]
    training = split_training(station_df)
    validation = split_validation(station_df)
    test = split_test(station_df)
    
    return training, validation, test
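    
In [ ]:
    
# Added sketch (commented out): the three splits should cover disjoint,
# consecutive date ranges. The station id below is hypothetical.
# tr, va, te = split_datasets(readings, 'BikePoints_1')
# print tr.index.min(), tr.index.max()
# print va.index.min(), va.index.max()
# print te.index.min(), te.index.max()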
    
In [6]:
    
import sys
def clip_and_round(arr):
    # bike counts must be non-negative integers
    arr = np.clip(arr, 0, sys.maxint)
    return np.round(arr)
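    
In [ ]:
    
# Quick check (added sketch): negative predictions are clipped to zero and
# fractional predictions are rounded to whole bikes.
clip_and_round(np.array([-1.2, 3.6]))  # -> array([ 0.,  4.])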
    
In [7]:
    
import inspect
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_squared_error
from rpy2 import robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
from rpy2.robjects import IntVector, Formula
pandas2ri.activate()
r = robjects.r
base = importr('base')
stats = importr('stats')
mgcv = importr('mgcv')
class GAMRegressor(BaseEstimator, RegressorMixin):
    last_data = None
    gam = None
    def __init__(self, features=None, formula_str=None,
                 FogTMinus2_parametric=None, RainTMinus2_parametric=None,
                 DistNbBikes_parametric=None, CollNbBikes_parametric=None, 
                 TempTMinus2AndHumidityTMinus2_sp=None,
                 NbBikesTMinus2_sp=None, NbBikesTMinus3_sp=None, 
                 NbBikesTMinus12_sp=None, NbBikesTMinus18_sp=None,             
                 TimeOfDay_1_sp=None, TimeOfDay_2_sp=None, TimeOfDay_3_sp=None,
                 TimeOfDay_1_by=None, TimeOfDay_2_by=None, TimeOfDay_3_by=None):
        
        args, _, _, values = inspect.getargvalues(inspect.currentframe())
        values.pop("self")
        self.args_to_set = []
        for arg, val in values.items():
            # save a list of the arguments
            if arg != 'features' and arg != 'formula_str':
                self.args_to_set.append(arg)
            setattr(self, arg, val)
    def fit(self, X, y=None): 
        if self.formula_str is None:
            features_dicts = self.build_features_dicts()
            self.formula_str = self.build_formula_str(features_dicts)       
            
        # keep a reference to the most recent training data (debugging aid)
        GAMRegressor.last_data = X
            
        frm = Formula(self.formula_str)
        self.gam = mgcv.gam(frm, data=X)
        
        return self
    def predict(self, X):
        assert (self.gam is not None), "GAM must be set"
        p_val = clip_and_round(stats.predict(self.gam, newdata=X))
        return p_val
    
    def score(self, X):
        # negated RMSE: sklearn scorers treat greater as better
        p_val = self.predict(X)
        y_val = X.NbBikes
        rmse = mean_squared_error(y_val, p_val)**0.5
        return -rmse
    
    def build_features_dicts(self):
        assert (self.features is not None), "features must be set"
        
        # initialize the dictionaries
        features_dicts = {}
        for feature in self.features:
            features_dicts[feature] = {
                'name': feature,
                'bs': 'tp',
                'sp': None,
                'by': None,
                'k': None,
                'parametric': False
            }
            
        # set parameter values
        for arg in self.args_to_set:
            val = getattr(self, arg)
            if val is None:
                continue
            feature, parameter = arg.rsplit('_',1)
            features_dicts[feature][parameter] = val
            
        return features_dicts
    
    def build_formula_str(self, features_dicts):
        formula = 'NbBikes ~ '
        for feature, feature_dict in features_dicts.iteritems():
            if feature_dict['parametric']:
                formula += '%(name)s+' % feature_dict
                continue
                                
            # drop any numeric suffix ('TimeOfDay_1' -> 'TimeOfDay') and expand
            # 'And' into a comma for multi-variable smooths
            name = feature_dict['name'].split('_')[0]
            formula += "s(%s" % name.replace('And', ',')
            
            if feature_dict['bs'] is not None:
                formula += ", bs='%s'" % feature_dict['bs']
            if feature_dict['sp'] is not None:
                formula += ", sp=%f" % feature_dict['sp']
            if feature_dict['by'] is not None:
                formula += ", by=%s" % feature_dict['by']
            if feature_dict['k'] is not None:
                formula += ", k=%s" % feature_dict['k']
                
            formula += ")+" % feature_dict
        return formula[:-1]
    
class LRegressor(BaseEstimator, RegressorMixin):
    lr = None
    def __init__(self, formula_str):
        self.formula_str = formula_str
    def fit(self, X, y=None):            
        self.lr = stats.lm(Formula(self.formula_str), data=X)        
        return self
    def predict(self, X):
        assert (self.lr is not None), "LR must be set"
        p_val = clip_and_round(stats.predict(self.lr, newdata=X))
        return p_val
    
    def score(self, X):
        # negated RMSE: sklearn scorers treat greater as better
        p_val = self.predict(X)
        y_val = X.NbBikes
        rmse = mean_squared_error(y_val, p_val)**0.5
        return -rmse
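    
In [ ]:
    
# Added sketch: the formula string GAMRegressor builds from a feature list.
# 'And' is expanded into a comma for multi-variable smooths, and features
# flagged '_parametric' enter as plain linear terms. Term order depends on
# dict iteration order.
g = GAMRegressor(features=['NbBikesTMinus2', 'RainTMinus2'],
                 NbBikesTMinus2_sp=8.0, RainTMinus2_parametric=True)
print g.build_formula_str(g.build_features_dicts())
# e.g. NbBikes ~ RainTMinus2+s(NbBikesTMinus2, bs='tp', sp=8.000000)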
    
In [8]:
    
def fit_and_predict_lm(training, validation, test, formula):
    # LRegressor.predict already clips and rounds, so no extra post-processing
    lm = LRegressor(formula_str=formula)
    lm.fit(training)
    return lm, lm.predict(training), lm.predict(validation), lm.predict(test)
    
In [9]:
    
def fit_and_predict_gam(training, validation, test, formula):
    # GAMRegressor.predict already clips and rounds
    gam = GAMRegressor(formula_str=formula)
    gam.fit(training)
    return gam, gam.predict(training), gam.predict(validation), gam.predict(test)
    
In [10]:
    
def fit_and_predict_hist_avg(training, validation, test):
    return None, training.HistAvg, validation.HistAvg, test.HistAvg
    
In [11]:
    
def model(df, station_ids, gam_formula, lr_formula, pred_col):
    results = []
    for station_id in station_ids:
        print 'Fitting %s' % station_id
            
        training, validation, test = split_datasets(df, station_id)      
        t_train = training[pred_col]
        t_validation = validation[pred_col]
        t_test = test[pred_col]
        
        try:            
            # Linear Model
            lm_fit = fit_and_predict_lm(training, validation, test, lr_formula)
            lm_rmse_train = mean_squared_error(t_train, lm_fit[1])**0.5
            lm_rmse_val = mean_squared_error(t_validation, lm_fit[2])**0.5
            lm_rmse_test = mean_squared_error(t_test, lm_fit[3])**0.5
        
            # GAM Model
            gam_fit = fit_and_predict_gam(training, validation, test, gam_formula)
            gam_rmse_train = mean_squared_error(t_train, gam_fit[1])**0.5
            gam_rmse_val = mean_squared_error(t_validation, gam_fit[2])**0.5
            gam_rmse_test = mean_squared_error(t_test, gam_fit[3])**0.5
            
            # Hist Avg
            havg_fit = fit_and_predict_hist_avg(training, validation, test)
            havg_rmse_train = mean_squared_error(t_train, havg_fit[1])**0.5
            havg_rmse_val = mean_squared_error(t_validation, havg_fit[2])**0.5
            havg_rmse_test = mean_squared_error(t_test, havg_fit[3])**0.5
        except Exception as e:
            logging.error(traceback.format_exc())
            continue  # skip stations whose models fail to fit
        
        results.append({'Id': station_id, 
                        'LR-TRAIN-ERR': lm_rmse_train, 'LR-VAL-ERR': lm_rmse_val, 'LR-TEST-ERR': lm_rmse_test,
                        'GAM-TRAIN-ERR': gam_rmse_train, 'GAM-VAL-ERR': gam_rmse_val, 'GAM-TEST-ERR': gam_rmse_test,
                        'HAVG-TRAIN-ERR': havg_rmse_train, 'HAVG-VAL-ERR': havg_rmse_val, 'HAVG-TEST-ERR': havg_rmse_test,
                        'LR-MOD': lm_fit[0], 'GAM-MOD': gam_fit[0]})
        
    return results
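    
In [ ]:
    
# Usage sketch (added, commented out): fit the three models for a single
# station before launching the full loop. The station id is hypothetical.
# model(readings_short, ['BikePoints_1'], gam_formula_short,
#       lr_formula_short, 'NbBikes')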
    
In [12]:
    
def convert_results_to_df(results, name):
    dfs = [pd.DataFrame(result).set_index('Id') for result in results]
    for i,df in enumerate(dfs):
        df.columns = ['%s-EXP%d-%s' % (name, i,col) for col in df.columns]
    return pd.concat(dfs, axis=1)
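    
In [ ]:
    
# Added note: convert_results_to_df prefixes every metric column with the
# experiment name and run index, e.g. 'GAM-TEST-ERR' from the first run of
# the short-term experiment becomes 'SHORT-EXP0-GAM-TEST-ERR'.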
    
In [13]:
    
stations_to_use = readings.index.get_level_values(0).unique().tolist()
    
In [14]:
    
gam_formula_short = "NbBikes ~ s(TempTMinus2, HumidityTMinus2, bs='tp', sp=30.0) + s(TimeOfDay, by=Weekday, bs='tp', sp=1.1) "  
gam_formula_short += "+ s(TimeOfDay, by=Weekend, bs='tp', sp=50.0) + s(TimeOfDay, by=Holiday, bs='tp', sp=0.2) + s(NbBikesTMinus2, bs='tp', sp=8.0) "
gam_formula_short += "+ s(NbBikesTMinus3, bs='tp', sp=11.0) + RainTMinus2 + FogTMinus2 "
    
In [15]:
    
lr_formula_short = "NbBikes ~ TempTMinus2 + HumidityTMinus2 + TimeOfDay + Weekday "
lr_formula_short += "+ NbBikesTMinus2 + NbBikesTMinus3 + RainTMinus2 + FogTMinus2 "
    
In [16]:
    
# choose the columns to use in the model
boolean_cols_short = ['Weekday', 'Weekend', 'Holiday', 'RainTMinus2', 'FogTMinus2']
numeric_cols_short = ['HumidityTMinus2', 'TempTMinus2', 'TimeOfDay',
                      'NbBikesTMinus2', 'NbBikesTMinus3', 'HistAvg']                       
pred_col_short = 'NbBikes'
feature_cols_short = numeric_cols_short + boolean_cols_short
cols_short = [pred_col_short] + feature_cols_short
# select the chosen columns
readings_short = readings.loc[stations_to_use][cols_short]
# remove na
readings_short.dropna(inplace=True)
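    
In [ ]:
    
# Sanity check (added sketch): no NaNs should survive the dropna above.
print readings_short.shape
print readings_short.isnull().values.any()  # expect False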
    
In [17]:
    
baseline_short = [model(readings_short, stations_to_use, gam_formula_short, lr_formula_short, 'NbBikes')]
    
    
In [18]:
    
baseline_short_df = convert_results_to_df(baseline_short, 'SHORT')
    
In [19]:
    
baseline_short_df
    
    Out[19]:
In [20]:
    
with open("data/parsed/models_short.p", "wb") as f:
    pickle.dump(baseline_short, f)
    
In [103]:
    
baseline_short_df.mean()
    
    Out[103]:
In [14]:
    
lr_formula_mid = "NbBikes ~ TempTMinus12 + HumidityTMinus12 + TimeOfDay + Weekday + DistNbBikes + CollNbBikes"
lr_formula_mid += "+ NbBikesTMinus12 + NbBikesTMinus18 + Near1TMinus12 + Near2TMinus12 + Near1TMinus18 + Near2TMinus18 "
    
In [15]:
    
gam_formula_mid = "NbBikes ~ s(TempTMinus12, HumidityTMinus12, bs='tp') + s(TimeOfDay, by=Weekday, bs='tp') "
gam_formula_mid += "+ s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') + s(NbBikesTMinus12, bs='tp') "
gam_formula_mid += "+ s(NbBikesTMinus18, bs='tp') "
    
In [16]:
    
# choose the columns to use in the model
boolean_cols_mid = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_mid = ['HumidityTMinus12', 'TempTMinus12', 'TimeOfDay', 'NbBikesTMinus12', 'NbBikesTMinus18', 'HistAvg', 
                    'Near1TMinus12', 'Near2TMinus12', 'Near1TMinus18', 'Near2TMinus18', 'DistNbBikes', 'CollNbBikes']
pred_col_mid = 'NbBikes'
feature_cols_mid = numeric_cols_mid + boolean_cols_mid
cols_mid = [pred_col_mid] + feature_cols_mid
# select the chosen columns
readings_mid = readings.loc[stations_to_use][cols_mid]
# remove na
readings_mid.dropna(inplace=True)
    
In [17]:
    
baseline_mid = [model(readings_mid, stations_to_use, gam_formula_mid, lr_formula_mid, 'NbBikes')]
    
    
In [21]:
    
baseline_mid_df = convert_results_to_df(baseline_mid, 'MID')
    
In [ ]:
    
with open("data/parsed/models_mid.p", "wb") as f:
    pickle.dump(baseline_mid, f)
    
In [157]:
    
# choose the columns to use in the model
redistribution_cols = ['CollNbBikesCum6', 'DistNbBikesCum6']
boolean_cols_long = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_long = ['TimeOfDay', 'HistAvg']
pred_col_long = 'NbBikes'
feature_cols_long = numeric_cols_long + boolean_cols_long + redistribution_cols
cols_long = [pred_col_long] + feature_cols_long
# select the chosen columns
readings_long = readings.loc[stations_to_use][cols_long]
# remove na
readings_long.dropna(inplace=True)
    
In [158]:
    
gam_formula_long = "NbBikes ~ s(TimeOfDay, by=Weekday, bs='tp', sp=35.0) + s(TimeOfDay, by=Weekend, bs='tp', sp=0.7) "
gam_formula_long += "+ s(TimeOfDay, by=Holiday, bs='tp', sp=3.0) + s(HistAvg, bs='tp', k=5) + CollNbBikesCum6 + DistNbBikesCum6 "
    
In [159]:
    
lr_formula_long = "NbBikes ~ TimeOfDay + Weekday + Holiday + Weekend + HistAvg + CollNbBikesCum6 + DistNbBikesCum6"
    
In [160]:
    
all_long = [model(readings_long, stations_to_use, gam_formula_long, lr_formula_long, 'NbBikes')]
    
    
In [162]:
    
all_long_df = convert_results_to_df(all_long, 'LONG')
    
In [86]:
    
with open("data/parsed/models_long.p", "wb") as f:
    pickle.dump(all_long, f)
    
    
In [21]:
    
with open("data/parsed/models_short.p", "rb") as f:
    models_short = pickle.load(f)
with open("data/parsed/models_mid.p", "rb") as f:
    models_mid = pickle.load(f)
    
In [163]:
    
short_df = convert_results_to_df(models_short, 'SHORT')
mid_df = convert_results_to_df(models_mid, 'MID')
long_df = convert_results_to_df(all_long, 'LONG')
    
In [164]:
    
long_df.describe()
    
    Out[164]:
In [165]:
    
short_df['Scenario'] = 'Short-Term'
short_df.set_index('Scenario', append=True, inplace=True)
short_df = short_df.reorder_levels(['Scenario', 'Id'])[['SHORT-EXP0-HAVG-TEST-ERR', 'SHORT-EXP0-LR-TEST-ERR', 'SHORT-EXP0-GAM-TEST-ERR']]
short_df.columns=['HAVG-TEST-ERR', 'LR-TEST-ERR', 'GAM-TEST-ERR']
mid_df['Scenario'] = 'Mid-Term'
mid_df.set_index('Scenario', append=True, inplace=True)
mid_df = mid_df.reorder_levels(['Scenario', 'Id'])[['MID-EXP0-HAVG-TEST-ERR', 'MID-EXP0-LR-TEST-ERR', 'MID-EXP0-GAM-TEST-ERR']]
mid_df.columns=['HAVG-TEST-ERR', 'LR-TEST-ERR', 'GAM-TEST-ERR']
long_df['Scenario'] = 'Long-Term'
long_df.set_index('Scenario', append=True, inplace=True)
long_df = long_df.reorder_levels(['Scenario', 'Id'])[['LONG-EXP0-HAVG-TEST-ERR', 'LONG-EXP0-LR-TEST-ERR', 'LONG-EXP0-GAM-TEST-ERR']]
long_df.columns=['HAVG-TEST-ERR', 'LR-TEST-ERR', 'GAM-TEST-ERR']
    
In [166]:
    
aggregations = {
    'HAVG-TEST-ERR': {
        'mean': 'mean',
        'std': 'std'
    },
    'LR-TEST-ERR': {
        'mean': 'mean',
        'std': 'std'
    },
    'GAM-TEST-ERR': {
        'mean': 'mean',
        'std': 'std'
    }
}
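    
In [ ]:
    
# Added portability note: dict-of-dict aggregation with renaming works in the
# pandas version used here, but it was deprecated in pandas 0.20 and removed
# in 1.0; on newer versions, grouped.agg(['mean', 'std']) produces the same
# (column, stat) MultiIndex consumed below.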
    
In [167]:
    
rmse_results = pd.concat([short_df, mid_df, long_df])
rmse_results['Metric'] = 'RMSE'
    
In [168]:
    
# normalize each station's error by its capacity: WRMSE = RMSE / NbDocks
wrmse_results = pd.concat([short_df, mid_df, long_df]).merge(readings.groupby(level='Id')['NbDocks'].mean().to_frame(), how='inner', right_index=True, left_index=True)
wrmse_results['HAVG-TEST-ERR'] = wrmse_results['HAVG-TEST-ERR'] / wrmse_results.NbDocks
wrmse_results['LR-TEST-ERR'] = wrmse_results['LR-TEST-ERR'] / wrmse_results.NbDocks
wrmse_results['GAM-TEST-ERR'] = wrmse_results['GAM-TEST-ERR'] / wrmse_results.NbDocks
wrmse_results['Metric'] = 'WRMSE'
wrmse_results.drop('NbDocks', axis=1, inplace=True)
    
In [169]:
    
results = pd.concat([rmse_results, wrmse_results]).reset_index().groupby(['Metric', 'Scenario']).agg(aggregations)
results['HAVG-TEST-ERR', 'HAVG-TEST-ERR'] = results['HAVG-TEST-ERR', 'mean'].map(str) + " (" + results['HAVG-TEST-ERR', 'std'].map(str) + ")"
results['LR-TEST-ERR', 'LR-TEST-ERR'] = results['LR-TEST-ERR', 'mean'].map(str) + " (" + results['LR-TEST-ERR', 'std'].map(str) + ")"
results['GAM-TEST-ERR', 'GAM-TEST-ERR'] = results['GAM-TEST-ERR', 'mean'].map(str) + " (" + results['GAM-TEST-ERR', 'std'].map(str) + ")"
results.columns = results.columns.droplevel()
results = results[['HAVG-TEST-ERR', 'LR-TEST-ERR', 'GAM-TEST-ERR']]
results.index.names = [None, None]
    
In [170]:
    
results
    
    Out[170]:
In [121]:
    
print results.to_latex()
    
    
In [81]:
    
# station with the median short-term GAM test error (used for the R diagnostics below)
baseline_short_df['SHORT-EXP0-GAM-TEST-ERR'].sort_values()[765/2:765/2 + 1]
    
    Out[81]:
In [82]:
    
training, validation, test = split_datasets(readings_short, 'BikePoints_148')
with open("data/parsed/r_eval_short.p", "wb") as f:
    pickle.dump(training, f)
    
In [99]:
    
with open("data/parsed/r_eval_short.p", "rb") as f:
    r_eval_short = pickle.load(f)
    
In [100]:
    
%load_ext rpy2.ipython
robjects.globalenv['data'] = r_eval_short
    
    
In [101]:
    
lr_formula_short
    
    Out[101]:
In [102]:
    
%%R
lrModel <- lm(NbBikes ~ TempTMinus2 + HumidityTMinus2 + TimeOfDay + Weekday + NbBikesTMinus2 + NbBikesTMinus3 + RainTMinus2 + FogTMinus2 , data=data)
summary(lrModel)
    
    
In [85]:
    
%%R
library(mgcv)
gamModel <- gam(NbBikes ~ s(TempTMinus2, HumidityTMinus2, bs='tp', sp=30.0) + s(TimeOfDay, by=Weekday, bs='tp', sp=1.1) 
                + s(TimeOfDay, by=Weekend, bs='tp', sp=50.0) + s(TimeOfDay, by=Holiday, bs='tp', sp=0.2) 
                + s(NbBikesTMinus2, bs='tp', sp=8.0) + s(NbBikesTMinus3, bs='tp', sp=11.0) 
                + RainTMinus2 + FogTMinus2, data=data)
summary(gamModel)
    
    
In [86]:
    
%%R
postscript("reports/images/short-check.eps",horizontal=FALSE, paper="special",height=6,width=10, onefile=TRUE)
gam.check(gamModel)
dev.off()
    
    
In [87]:
    
%%R
postscript("reports/images/short-splines.eps",horizontal=FALSE, paper="special",height=6,width=10, onefile=TRUE)
layout(matrix(1:2, ncol = 2))
plot(gamModel, scale = 0)
layout(1)
dev.off()
    
    
In [88]:
    
%%R
postscript("reports/images/short-correlation.eps",horizontal=FALSE, paper="special",height=6,width=10, onefile=TRUE)
layout(matrix(1:2, ncol = 2))
acf(resid(gamModel), lag.max = 36, main = "ACF")
pacf(resid(gamModel), lag.max = 36, main = "pACF")
layout(1)
dev.off()
    
    
In [89]:
    
# station with the median mid-term GAM test error
baseline_mid_df['MID-EXP0-GAM-TEST-ERR'].sort_values()[765/2:765/2 + 1]
    
    Out[89]:
In [90]:
    
training, validation, test = split_datasets(readings_mid, 'BikePoints_231')
with open("data/parsed/r_eval_mid.p", "wb") as f:
    pickle.dump(training, f)
    
In [91]:
    
with open("data/parsed/r_eval_mid.p", "rb") as f:
    r_eval_mid = pickle.load(f)
    
In [92]:
    
%load_ext rpy2.ipython
robjects.globalenv['data'] = r_eval_mid
    
    
In [93]:
    
gam_formula_mid
    
    Out[93]:
In [94]:
    
%%R
library(mgcv)
gamModel <- gam(NbBikes ~ s(TempTMinus12, HumidityTMinus12, bs='tp') + s(TimeOfDay, by=Weekday, bs='tp') 
                + s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') 
                + s(NbBikesTMinus12, bs='tp') + s(NbBikesTMinus18, bs='tp'), data=data)
summary(gamModel)
    
    
In [95]:
    
%%R
postscript("reports/images/mid-check.eps",horizontal=FALSE, paper="special",height=6,width=10, onefile=TRUE)
gam.check(gamModel)
dev.off()
    
    
In [96]:
    
%%R
postscript("reports/images/mid-splines.eps",horizontal=FALSE, paper="special",height=6,width=10, onefile=TRUE)
layout(matrix(1:2, ncol = 2))
plot(gamModel, scale = 0)
layout(1)
dev.off()
    
    
In [97]:
    
%%R
postscript("reports/images/mid-correlation.eps",horizontal=FALSE, paper="special",height=6,width=10, onefile=TRUE)
layout(matrix(1:2, ncol = 2))
acf(resid(gamModel), lag.max = 36, main = "ACF")
pacf(resid(gamModel), lag.max = 36, main = "pACF")
layout(1)
dev.off()
    
    
In [171]:
    
# station with the median long-term GAM test error
all_long_df['LONG-EXP0-GAM-TEST-ERR'].sort_values()[765/2:765/2 + 1]
    
    Out[171]:
In [40]:
    
gam_formula_long
    
    Out[40]:
In [172]:
    
training, validation, test = split_datasets(readings_long, 'BikePoints_610')
with open("data/parsed/r_eval_long.p", "wb") as f:
    pickle.dump(training, f)
    
In [173]:
    
with open("data/parsed/r_eval_long.p", "rb") as f:
    r_eval_long = pickle.load(f)
    
In [174]:
    
%load_ext rpy2.ipython
robjects.globalenv['data'] = r_eval_long
    
In [175]:
    
%%R
library(mgcv)
gamModel <- gam(NbBikes ~ s(TimeOfDay, by=Weekday, bs='tp', sp=35.0) + s(TimeOfDay, by=Weekend, bs='tp', sp=0.7) 
                + s(TimeOfDay, by=Holiday, bs='tp', sp=3.0) + s(HistAvg, bs='tp', k=5) 
                + CollNbBikesCum6 + DistNbBikesCum6 , data=data)
summary(gamModel)
    
    
In [177]:
    
%%R
postscript("reports/images/long-check.eps",horizontal=FALSE, paper="special",height=6,width=10, onefile=TRUE)
gam.check(gamModel)
dev.off()
    
    
In [178]:
    
%%R
postscript("reports/images/long-splines.eps",horizontal=FALSE, paper="special",height=6,width=10, onefile=TRUE)
layout(matrix(1:2, ncol = 2))
plot(gamModel, scale = 0)
layout(1)
dev.off()
    
    
In [179]:
    
%%R
postscript("reports/images/long-correlation.eps",horizontal=FALSE, paper="special",height=6,width=10, onefile=TRUE)
layout(matrix(1:2, ncol = 2))
acf(resid(gamModel), lag.max = 36, main = "ACF")
pacf(resid(gamModel), lag.max = 36, main = "pACF")
layout(1)
dev.off()
    
    
In [48]:
    
%run src/data/visualization.py
    
In [93]:
    
now = datetime(year=2016, month=6, day=22, hour=9, minute=0, second=0)
single_reading = readings.xs(now, level='Timestamp')
    
In [102]:
    
# one-hour-ahead snapshot: predict each station with its stored mid-term GAM
predictions = []
for idx, row in single_reading.iterrows():
    gam = baseline_mid_df.loc[idx]['MID-EXP0-GAM-MOD']
    prediction = gam.predict(row.to_frame().transpose())[0]
    predictions.append({'Prediction': prediction, 'Id': idx, 
                        'NbDocks': row.NbDocks, 
                        'HistAvg': row.HistAvg,
                        'Current': row.NbBikesTMinus12})
predictions = pd.DataFrame(predictions).set_index('Id')
    
In [103]:
    
predictions_df = add_station_info(predictions, stations.set_index('Id'), use_indexes=True)
predictions_df['NormalizedPrediction'] = predictions_df.Prediction.astype(float) / predictions_df.NbDocks.astype(float)
    
In [168]:
    
from palettable.colorbrewer.diverging import RdYlBu_10
def create_prediction_marker(station):
    
    font_color_now = 'red' if station.Current < 5 else 'black'
    font_color_mid = 'red' if station.Prediction < 5 else 'black'
    font_color_long = 'red' if station.HistAvg < 5 else 'black'
    
    html="""
    <h1>%s</h1>
    <p><strong>Id</strong>: %s</p>
    <p>
    <strong>Bicycle Availability:</strong>
    <ul>
        <font color="%s"><li><strong>%s:</strong> %d bicycles</li></font>
        <font color="%s"><li><strong>%s: </strong> %d bicycles</li></font>
        <font color="%s"><li><strong>%s: </strong> %d bicycles</li></font>
    </ul>
    </p>
    <p>
        <strong>Availability Trend:</strong><br/><br/>
        <img src="http://localhost:8888/files/reports/images/trend.png" width=210 height=210 hspace=16/>
    </p>
    """ % (station.Name, station.name, 
           font_color_now, now.strftime('%d/%m - %H:%M'), station.Current, 
           font_color_mid, (now + timedelta(hours=1)).strftime('%d/%m - %H:%M'), station.Prediction,
           font_color_long, (now + timedelta(hours=3)).strftime('%d/%m - %H:%M'), station.HistAvg)
    iframe = folium.element.IFrame(html=html, width=350, height=530)
    popup = folium.Popup(iframe, max_width=2650)
        
    fill_color = cmap_to_hex(RdYlBu_10.mpl_colormap, station.NormalizedPrediction)
    
    return folium.CircleMarker(location=[station['Latitude'], station['Longitude']], radius=100,
                        popup=popup, fill_color=fill_color, color=fill_color, fill_opacity=0.99)
predictions_map = draw_stations_map(predictions_df, create_prediction_marker)
    
In [169]:
    
folium.TileLayer('stamentoner').add_to(predictions_map)
folium.Map.save(predictions_map, 'reports/maps/predicted_availability_map.html')
predictions_map
    
    Out[169]: