Train Models


In [1]:
%run src/data/visualization.py

In [4]:
%matplotlib inline

import math
import pytz 
import time
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime

In [5]:
import pickle

statistics = pickle.load(open('data/parsed/stations_statistics.p', 'rb'))
stations = pickle.load(open('data/parsed/stations_dataset_final.p', 'rb'))
readings = pickle.load(open("data/parsed/readings_full.p", "rb"))

Fill Redistribution

Column Analysis


In [6]:
columns = ['CollNbBikes', 'DewPt', 'DistNbBikes', 'Humidity', 'NbBikes', 'Rain', 'Temp', 'Visibility',
           'WindSpeed', 'TimeOfYear', 'TimeOfDay']
#columns = ['NbBikes', 'Rain', 'Temp']

data = readings[columns]

Pair Plot


In [ ]:
sns.set(style="ticks", color_codes=True)

g = sns.pairplot(data)

Correlation Plot


In [ ]:
corr = data.corr()

In [ ]:
import matplotlib.pyplot as plt

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
            square=True, xticklabels=2, yticklabels=2,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

DayViews


In [7]:
# NAB: share of docks currently holding a bike
readings['NAB'] = readings.NbBikes / (readings.NbBikes + readings.NbEmptyDocks)
# NAS: absolute change in bikes since the previous reading, as a share of total docks
readings['NAS'] = (readings.NbBikesTMinus1 - readings.NbBikes).apply(math.fabs) / (readings.NbBikes + readings.NbEmptyDocks)

In [8]:
dayviews = (readings.query('Holiday == 0')[['NAB', 'NAS', 'TimeOfDay', 'Weekday']]
            .dropna()
            .reset_index()
            .groupby(['Weekday', 'Id', 'TimeOfDay'])
            .mean())

NAS


In [9]:
%run src/data/visualization.py
%run src/data/helper.py

In [10]:
from sklearn import preprocessing

NAS_weekend = dayviews.loc[0].groupby(level='Id').mean().NAS
NAS_weekend = pd.Series(preprocessing.MinMaxScaler().fit_transform(NAS_weekend.values.reshape(-1, 1)).reshape(len(NAS_weekend)), index=NAS_weekend.index)
NAS_weekend = add_station_info(pd.DataFrame(NAS_weekend, columns=['NAS']), stations.set_index('Id'), use_indexes=True)

NAS_weekday = dayviews.loc[1].groupby(level='Id').mean().NAS
NAS_weekday = pd.Series(preprocessing.MinMaxScaler().fit_transform(NAS_weekday.values.reshape(-1, 1)).reshape(len(NAS_weekday)), index=NAS_weekday.index)
NAS_weekday = add_station_info(pd.DataFrame(NAS_weekday, columns=['NAS']), stations.set_index('Id'), use_indexes=True)
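
add_station_info comes from src/data/helper.py (loaded above with %run). For readers without that file, the following is a minimal sketch of the join it is assumed to perform: attaching station metadata such as Name, Latitude and Longitude to an Id-indexed frame. The function name and the use_indexes keyword come from the calls above; the body is an assumption, not the actual implementation.

In [ ]:
# Hypothetical sketch of add_station_info (assumed behaviour, not the real helper).
def add_station_info_sketch(df, stations_indexed, use_indexes=False):
    if use_indexes:
        # df is indexed by station Id: attach the metadata by joining on the index
        return df.join(stations_indexed, how='left')
    # otherwise match on an explicit 'Id' column
    return df.merge(stations_indexed, left_on='Id', right_index=True, how='left')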

In [34]:
import folium
import matplotlib.pyplot as plt

def nas_marker(station):
    color = cmap_to_hex(plt.get_cmap('autumn_r'), station.NAS) 
    label = "%s - %s" % (station.name, station['Name'])
    
    return folium.CircleMarker(location=[station['Latitude'], station['Longitude']], radius=75,
                        popup=label, fill_color=color, color=color, fill_opacity=0.9)
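
cmap_to_hex is defined in src/data/visualization.py. A minimal sketch of what it is assumed to do: look a value in [0, 1] up in a matplotlib colormap and return the colour as a '#rrggbb' string, which is the format folium expects. The real helper may differ.

In [ ]:
import matplotlib.colors as mcolors

# Hypothetical sketch of cmap_to_hex (assumed behaviour, not the real helper).
def cmap_to_hex_sketch(cmap, value):
    r, g, b, a = cmap(value)           # colormap lookup returns an RGBA tuple
    return mcolors.rgb2hex((r, g, b))  # convert the RGB part to a hex string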

In [35]:
NAS_weekday_map = draw_stations_map(NAS_weekday, nas_marker)
folium.Map.save(NAS_weekday_map, 'reports/maps/weekday_activity.html')
NAS_weekday_map


Out[35]:

In [36]:
NAS_weekend_map = draw_stations_map(NAS_weekend, nas_marker)
folium.Map.save(NAS_weekend_map, 'reports/maps/weekends_activity.html')
NAS_weekend_map


Out[36]:

NAB


In [72]:
def get_nab(weekday, hour):
    nab = dayviews2.loc[weekday].xs((hour - 1)* 3600, level='TimeOfDay')[['NAB']]
    nab.NAB = preprocessing.MinMaxScaler().fit_transform(nab.values.reshape(-1, 1))
    return add_station_info(nab, stations.set_index('Id'), use_indexes=True)

def nab_marker(station):
    color = cmap_to_hex(plt.get_cmap('autumn_r'), station.NAB) 
    label = "%s - %s" % (station.name, station['Name'])
    
    return folium.CircleMarker(location=[station['Latitude'], station['Longitude']], radius=75,
                        popup=label, fill_color=color, color=color, fill_opacity=0.9)

Weekdays


In [69]:
dayviews2 = dayviews.reset_index()
dayviews2.TimeOfDay = dayviews2.TimeOfDay.astype(int)
dayviews2 = dayviews2.set_index(['Weekday', 'Id', 'TimeOfDay'])

In [73]:
NAB_weekday_7_map = draw_stations_map(get_nab(1,7), nab_marker)
folium.Map.save(NAB_weekday_7_map, 'reports/maps/weekday_availability_7.html')
NAB_weekday_7_map


Out[73]:

In [74]:
NAB_weekday_10_map = draw_stations_map(get_nab(1,10), nab_marker)
folium.Map.save(NAB_weekday_10_map, 'reports/maps/weekday_availability_10.html')
NAB_weekday_10_map


Out[74]:

In [75]:
NAB_weekday_16_map = draw_stations_map(get_nab(1,16), nab_marker)
folium.Map.save(NAB_weekday_16_map, 'reports/maps/weekday_availability_16.html')
NAB_weekday_16_map


Out[75]:

In [76]:
NAB_weekday_19_map = draw_stations_map(get_nab(1,19), nab_marker)
folium.Map.save(NAB_weekday_19_map, 'reports/maps/weekday_availability_19.html')
NAB_weekday_19_map


Out[76]:

Weekends


In [ ]:
NAB_weekend_11_map = draw_stations_map(get_nab(0,11), nab_marker)
folium.Map.save(NAB_weekend_11_map, 'reports/maps/weekend_availability_11.html')
NAB_weekend_11_map

In [ ]:
NAB_weekend_13_map = draw_stations_map(get_nab(0,13), nab_marker)
folium.Map.save(NAB_weekend_13_map, 'reports/maps/weekend_availability_13.html')
NAB_weekend_13_map

In [ ]:
NAB_weekend_15_map = draw_stations_map(get_nab(0,15), nab_marker)
folium.Map.save(NAB_weekend_15_map, 'reports/maps/weekend_availability_15.html')
NAB_weekend_15_map

In [ ]:
NAB_weekend_19_map = draw_stations_map(get_nab(0,19), nab_marker)
folium.Map.save(NAB_weekend_19_map, 'reports/maps/weekend_availability_19.html')
NAB_weekend_19_map

In [ ]:
from matplotlib.ticker import FuncFormatter

timeofday_formatter = FuncFormatter(lambda x, pos: x / 3600)
station_id = 'BikePoints_374'

In [ ]:
ax = pd.concat([dayviews.loc[0, station_id].NAB.rename('Weekends'), 
                dayviews.loc[1, station_id].NAB.rename('Weekdays')], 
               axis=1).plot(xticks = range(7200, 25 * 3600, 14400), xlim=(21600, 86400))

ax.xaxis.set_major_formatter(timeofday_formatter)

In [ ]:
ax = pd.concat([dayviews.loc[0, station_id].NAS.rename('Weekends'), 
                dayviews.loc[1, station_id].NAS.rename('Weekdays')], 
               axis=1).plot(xticks = range(7200, 25 * 3600, 14400), xlim=(21600, 86400))

ax.xaxis.set_major_formatter(timeofday_formatter)

In [ ]:
ax = sns.tsplot(time='TimeOfDay', value='NAB', unit='Id', condition='Weekday', data=dayviews.reset_index())

# set ticks
ax.xaxis.set_major_formatter(timeofday_formatter)
ax.set_xticks(range(7200, 25 * 3600, 14400))
# set legends
legends = ax.legend().get_texts()
legends[0].set_text('Weekends')
legends[1].set_text('Weekdays')

In [ ]:
ax = sns.tsplot(time='TimeOfDay', value='NAS', unit='Id', condition='Weekday', data=dayviews.reset_index())

# set ticks
ax.xaxis.set_major_formatter(timeofday_formatter)
ax.set_xticks(range(7200, 25 * 3600, 14400))
# set legends
legends = ax.legend().get_texts()
legends[0].set_text('Weekends')
legends[1].set_text('Weekdays')

In [ ]:
priority1_ids = stations.query('Priority == "1"').Id
priority2_ids = stations.query('Priority == "2"').Id
# 'Priority != Priority' matches NaN priorities, i.e. the stations without an assigned priority
priority3_ids = stations.query('(Priority != Priority)').Id

In [ ]:
dayviews1 = dayviews.loc[1].sort_index().loc[priority1_ids.values.tolist()]
dayviews2 = dayviews.reset_index(level=0).sort_index().loc[priority2_ids.values.tolist()]
dayviews3 = dayviews.reset_index(level=0).sort_index().loc[priority3_ids.values.tolist()]

In [ ]:
ax = sns.tsplot(time='TimeOfDay', value='NAS', unit='Id', condition='Weekday', data=dayviews3.reset_index())

# set ticks
ax.xaxis.set_major_formatter(timeofday_formatter)
ax.set_xticks(range(7200, 25 * 3600, 14400))
# set legends
legends = ax.legend().get_texts()
legends[0].set_text('Weekends')
legends[1].set_text('Weekdays')

Clusters


In [6]:
from cdtw import cdtw_sakoe_chiba

def dtw_sakoe_chiba(a,b):
    return cdtw_sakoe_chiba(a, b, 12)
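
cdtw_sakoe_chiba is a compiled dynamic time warping routine. As a rough illustration of what the wrapper above computes (assuming its third argument is the Sakoe-Chiba band radius in samples), a much slower pure-Python sketch could look like this:

In [ ]:
# Pure-Python DTW with a Sakoe-Chiba band: illustrative fallback only,
# assumed to approximate the semantics of cdtw_sakoe_chiba(a, b, radius).
def dtw_sakoe_chiba_py(a, b, radius=12):
    n, m = len(a), len(b)
    INF = float('inf')
    cost = [[INF] * (m + 1) for _ in range(n + 1)]
    cost[0][0] = 0.0
    for i in range(1, n + 1):
        # only cells within the band |i - j| <= radius are considered
        for j in range(max(1, i - radius), min(m, i + radius) + 1):
            d = abs(a[i - 1] - b[j - 1])
            cost[i][j] = d + min(cost[i - 1][j],       # insertion
                                 cost[i][j - 1],       # deletion
                                 cost[i - 1][j - 1])   # match
    return cost[n][m]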

In [7]:
data = dayviews.unstack().loc[1].NAS

In [8]:
from scipy.spatial.distance import pdist,squareform

dist_condensed = pdist(data.values, dtw_sakoe_chiba)
dist_matrix = squareform(dist_condensed)

In [9]:
from sklearn.cluster import AgglomerativeClustering

agg_clustering = AgglomerativeClustering(n_clusters=5, affinity='precomputed', linkage="complete")
clusters = pd.Series(agg_clustering.fit_predict(dist_matrix), index=data.index)

In [10]:
clusters.value_counts()


Out[10]:
3    505
1    250
2      7
0      6
4      5
dtype: int64

In [13]:
data = add_station_info(clusters.to_frame('Cluster'), stations.set_index('Id'), use_indexes=True)
data.query('Priority == "1"').Cluster.value_counts()


Out[13]:
1    57
2     6
4     5
3     5
0     4
Name: Cluster, dtype: int64

In [20]:
clusters_df = dayviews.unstack().loc[1].NAS.copy()
clusters_df['Cluster'] = clusters
clusters_df = clusters_df[['Cluster']]
clusters_df.Cluster.nunique()


Out[20]:
1

In [ ]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
for col in statistics.columns.difference(['Latitude', 'Longitude', 'Priority']):
    std_col = '%sS' % col    
    statistics[std_col] = min_max_scaler.fit_transform(statistics[col].values.reshape(-1, 1))

In [ ]:
from sklearn.cluster import KMeans

statistics = statistics.sort_values(by=['Priority'])

priority_clusters = [(3,1), (2,2), (2,3)]
cluster_cols = ['EmptyEveningPeakS', 'EmptyMorningPeakS', 'EmptyNonPeakS', 
                'FullEveningPeakS', 'FullMorningPeakS', 'FullNonPeakS',
                'CountS']

clusters = []
offset = 0
for cls_prior in priority_clusters:
    n_clusters, priority = cls_prior
    window = statistics[statistics.Priority == priority][cluster_cols]
    p_clusters = KMeans(n_clusters=n_clusters).fit_predict(window.values)
    clusters.extend(p_clusters + offset) 
    
    offset += n_clusters
    
statistics['Cluster'] = clusters

In [ ]:
statistics.Cluster.hist(bins=7)

In [ ]:
Dark2_7.show_discrete_image()

In [ ]:
draw_stations_map(statistics, create_cluster_marker('Cluster'))

Split Dataset


In [ ]:
split_training = lambda df: df[datetime(2016,5,15,0,0,0,0):datetime(2016,6,12,23,59,59,999999)]
split_validation = lambda df: df[datetime(2016,6,13,0,0,0,0):datetime(2016,6,19,23,59,59,999999)]
split_test = lambda df: df[datetime(2016,6,20,0,0,0,0):datetime(2016,6,26,23,59,59,999999)]

Model


In [ ]:
def split_datasets(df, station_id):
    station_df = df.loc[station_id]
    training = split_training(station_df)
    validation = split_validation(station_df)
    test = split_test(station_df)
    
    return training, validation, test

In [ ]:
import sys

def clip_and_round(arr):
    arr = np.clip(arr, 0, sys.maxint)
    return np.round(arr)

In [ ]:
import inspect

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_squared_error

from rpy2 import robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
from rpy2.robjects import IntVector, Formula

pandas2ri.activate()

r = robjects.r
base = importr('base')
stats = importr('stats')
mgcv = importr('mgcv')

class GAMRegressor(BaseEstimator, RegressorMixin):  
    """An example of classifier"""

    def __init__(self, features=None, formula_str=None,
                 Fog_parametric=None, Rain_parametric=None, TempAndHumidity_k=None,
                 TMinus1_k=None, TMinus2_k=None, 
                 TMinus6_k=None, TMinus12_k=None, 
                 TimeOfYear_k=None, 
                 TimeOfDay_1_k=None, TimeOfDay_2_k=None, TimeOfDay_3_k=None,
                 TimeOfDay_1_bs=None, TimeOfDay_2_bs=None, TimeOfDay_3_bs=None,
                 TimeOfDay_1_by=None, TimeOfDay_2_by=None, TimeOfDay_3_by=None):
        
        args, _, _, values = inspect.getargvalues(inspect.currentframe())
        values.pop("self")

        self.args_to_set = []
        for arg, val in values.items():
            # save a list of the arguments
            if arg != 'features' and arg != 'formula_str':
                self.args_to_set.append(arg)
            setattr(self, arg, val)

    def fit(self, X, y=None): 
        if self.formula_str is None:
            features_dicts = self.build_features_dicts()
            self.formula_str = self.build_formula_str(features_dicts)
            #print self.formula_str
            
        self.gam = mgcv.gam(Formula(self.formula_str), data=X)
        
        return self

    def predict(self, X):
        assert (self.gam is not None), "GAM must be set"
        p_val = clip_and_round(stats.predict(self.gam, newdata=X))
        return p_val
    
    def score(self, X):
        p_val = self.predict(X)
        y_val = X.NbBikes
        rmse = mean_squared_error(y_val, p_val)**0.5
        # return negative RMSE so that a higher score means a better fit (scikit-learn convention)
        return rmse * (-1)
    
    def build_features_dicts(self):
        assert (self.features is not None), "features must be set"
        
        # initialize the dictionaries
        features_dicts = {}
        for feature in self.features:
            features_dicts[feature] = {
                'name': feature,
                'bs': 'tp',
                'sp': None,
                'by': None,
                'k': None,
                'parametric': False
            }
            
        # set parameter values
        for arg in self.args_to_set:
            val = getattr(self, arg)
            if val is None:
                continue
            feature, parameter = arg.rsplit('_',1)
            features_dicts[feature][parameter] = val
            
        return features_dicts
    
    def build_formula_str(self, features_dicts):
        formula = 'NbBikes ~ '
        for feature, feature_dict in features_dicts.iteritems():
            if feature_dict['parametric']:
                formula += '%(name)s+' % feature_dict
                continue
                                
            tokens = feature_dict['name'].split('_')
            name, index = (tokens[0],None) if len(tokens) == 1 else (tokens[0], tokens[1])
            formula += "s(%s" % name.replace('And', ',')
            
            if feature_dict['bs'] is not None:
                formula += ", bs='%s'" % feature_dict['bs']
            if feature_dict['sp'] is not None:
                formula += ", sp=%0.3f" % feature_dict['sp']
            if feature_dict['by'] is not None:
                formula += ", by=%s" % feature_dict['by']
            if feature_dict['k'] is not None:
                formula += ", k=%s" % feature_dict['k']
                
            formula += ")+" % feature_dict
        return formula[:-1]
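
To make the formula construction above concrete, a small illustration of the two ways the estimator can be configured. The parameter values are arbitrary; only the formula assembled by build_formula_str is being demonstrated.

In [ ]:
# Illustration only: the feature list and parameter values below are arbitrary, not tuned.
gam_example = GAMRegressor(features=['TempAndHumidity', 'TimeOfDay_1', 'Rain'],
                           TempAndHumidity_k=15,
                           TimeOfDay_1_by='Weekday', TimeOfDay_1_bs='cc',
                           Rain_parametric=True)
print gam_example.build_formula_str(gam_example.build_features_dicts())
# one possible result (term order depends on dict iteration):
#   NbBikes ~ s(Temp,Humidity, bs='tp', k=15)+s(TimeOfDay, bs='cc', by=Weekday)+Rain

# Alternatively, pass a ready-made mgcv formula directly:
gam_explicit = GAMRegressor(formula_str="NbBikes ~ s(TimeOfDay, bs='tp') + s(NbBikesTMinus12, bs='tp')")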

In [ ]:
from sklearn.linear_model import LinearRegression

def fit_and_predict_lm(X_train, y_train, X_val):
    lm = LinearRegression()
    lm.fit(X_train, y_train)
    return lm, clip_and_round(lm.predict(X_val))

In [ ]:
def fit_and_predict_gam(training, validation, formula):
    gam = GAMRegressor(formula_str=formula)
    gam.fit(training)
    return gam, clip_and_round(gam.predict(validation))

In [ ]:
def model(df, station_ids, gam_formula, feature_cols, pred_col):
    results = []

    for station_id in station_ids:
        print 'Fitting %s' % station_id
            
        training, validation, test = split_datasets(df, station_id)                
        
        X_train, y_train = training[feature_cols], training[pred_col]
        X_val, y_val = validation[feature_cols], validation[pred_col]
        
        try:            
            # Linear Model
            lm_fit = fit_and_predict_lm(X_train, y_train, X_val)
            lm_rmse = mean_squared_error(y_val, lm_fit[1])**0.5
        
            # GAM Model
            gam_fit = fit_and_predict_gam(training, validation, gam_formula)
            gam_rmse = mean_squared_error(y_val, gam_fit[1])**0.5
        except:
            print 'Error in station %s' % station_id
            continue
        
        results.append({'Id': station_id, 'LM': lm_rmse, 'GAM': gam_rmse, 'LM_p': lm_fit[1], 'GAM_p': gam_fit[1]})
        
    return results

Short Term


In [ ]:
# choose the columns to use in the model
boolean_cols_short = ['Weekday', 'Weekend', 'Holiday', 'RainTMinus1', 'FogTMinus1']
numeric_cols_short = ['HumidityTMinus1', 'TempTMinus1', 'TimeOfDay', 'NbBikesTMinus1', 'NbBikesTMinus2']
pred_col_short = 'NbBikes'

feature_cols_short = numeric_cols_short + boolean_cols_short
cols_short = [pred_col_short] + feature_cols_short

# select the chosen columns
readings_short = readings[cols_short].copy()

# remove na
readings_short.dropna(inplace=True)

In [ ]:
gam_formula_short = "NbBikes ~ s(TempTMinus1, HumidityTMinus1, bs='tp') + s(TimeOfDay, by=Weekday, bs='tp') "  
gam_formula_short += "+ s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') + s(NbBikesTMinus1, bs='tp') "
gam_formula_short += "+ s(NbBikesTMinus2, bs='tp') + RainTMinus1 + FogTMinus1 "

station_ids = readings_short.index.get_level_values('Id').unique()        
results_short = model(readings_short, station_ids, gam_formula_short, feature_cols_short, pred_col_short)

In [ ]:
%run src/data/visualization.py

In [ ]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
df = add_station_info(pd.DataFrame(results_short)[['Id', 'GAM', 'LM']], stations)
df['Priority'] = df.Priority.fillna(0).astype('int16')
df['GAMS'] = min_max_scaler.fit_transform(df['GAM'].values.reshape(-1, 1))
df['LMS'] = min_max_scaler.fit_transform(df['LM'].values.reshape(-1, 1))

In [ ]:
draw_stations_map(df, create_result_marker('GAMS'))

Mid Term


In [ ]:
# choose the columns to use in the model
boolean_cols_mid = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_mid = ['HumidityTMinus12', 'TempTMinus12', 'TimeOfDay', 'NbBikesTMinus12', 'NbBikesTMinus18']
pred_col_mid = 'NbBikes'

feature_cols_mid = numeric_cols_mid + boolean_cols_mid
cols_mid = [pred_col_mid] + feature_cols_mid

# select the chosen columns
readings_mid = readings[cols_mid].copy()

# remove na
readings_mid.dropna(inplace=True)

In [ ]:
gam_formula_mid = "NbBikes ~ s(TempTMinus12, HumidityTMinus12, bs='tp') + s(TimeOfDay, by=Weekday, bs='tp') "
gam_formula_mid += "+ s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') + s(NbBikesTMinus12, bs='tp') "
gam_formula_mid += "+ s(NbBikesTMinus18, bs='tp') "

station_ids = readings_mid.index.get_level_values('Id').unique()        
results_mid = model(readings_mid, station_ids, gam_formula_mid, feature_cols_mid, pred_col_mid)

In [ ]:
pd.DataFrame(results_mid)[['Id', 'GAM', 'LM']].mean()

Long Term


In [ ]:
# choose the columns to use in the model
boolean_cols_long = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_long = ['TimeOfDay']
pred_col_long = 'NbBikes'

feature_cols_long = numeric_cols_long + boolean_cols_long
cols_long = [pred_col_long] + feature_cols_long

# select the chosen columns
readings_long = readings[cols_long].copy()

# remove na
readings_long.dropna(inplace=True)

In [ ]:
gam_formula_long = "NbBikes ~ s(TimeOfDay, by=Weekday, bs='tp') "  
gam_formula_long += "+ s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') "

station_ids = readings_long.index.get_level_values('Id').unique()        
results_long = model(readings_long, station_ids, gam_formula_long, feature_cols_long, pred_col_long)

In [ ]:
pd.DataFrame(results_long)[['Id', 'GAM', 'LM']].mean()

In [ ]:
from sklearn.grid_search import GridSearchCV

training, validation, test = split_datasets(readings_mid, 'BikePoints_101')
search_dataset = pd.concat([training, validation])

# features of the model
features = ['TempAndHumidity', 'TimeOfDay_1', 'TimeOfDay_2', 'TimeOfDay_3', 'TMinus6', 'TMinus12', 'Rain', 'Fog']

# parameters to tune via cross validation
parameters = [{'TimeOfDay_1_by': ['Weekday'], 'TimeOfDay_2_by': ['Weekend'], 'TimeOfDay_3_by': ['Holiday'],
               'Fog_parametric': [True], 'Rain_parametric': [True],
               'TMinus6_k': [3,4,5], 'TMinus12_k': [6,7,8], 'TempAndHumidity_k': [15,16,17],
               'TimeOfDay_1_k': [7,8,9], 'TimeOfDay_2_k': [6,7,8], 'TimeOfDay_3_k': [7,8,9],
               'TimeOfDay_1_bs': ['cc', 're', 'tp'], 'TimeOfDay_2_bs': ['cc', 're', 'tp'], 'TimeOfDay_3_bs': ['cc', 're', 'tp'],
               #'TMinus6_sp': np.arange(0,1,0.2), 'TMinus12_sp': np.arange(0,1,0.2),
               #'TimeOfDay_1_sp': np.arange(0,1,0.2), 'TimeOfDay_2_sp': np.arange(0,1,0.2), 'TimeOfDay_3_sp': np.arange(0,1,0.2),
               'features': [features]}]

# tuning hyper parameters
clf = GridSearchCV(GAMRegressor(), parameters, cv=4)
clf.fit(training)

print 'Best parameters set found on dev set:'
print clf.best_params_

print 'Grid scores on development set:'
for params, mean_score, scores in clf.grid_scores_:
    print '%0.3f (+/-%0.03f) for %r' % (mean_score, scores.std() * 2, params)

print 'The model is trained on the full development set.'
print 'The scores are computed on the full evaluation set.'
p_test = clf.predict(validation)
t_test = validation.NbBikes
rmse = mean_squared_error(t_test, p_test)**0.5
rmse

GAM Analysis


In [ ]:
%load_ext rpy2.ipython

In [ ]:
# choose the columns to use in the model
pred_col_mid = 'NbBikes'
boolean_cols_mid = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_mid = ['HumidityTMinus12', 'TempTMinus12', 'TimeOfDay', 'NbBikesTMinus12', 'NbBikesTMinus18', 
                    'DistNbBikes', 'CollNbBikes']

feature_cols_mid = numeric_cols_mid + boolean_cols_mid
cols_mid = [pred_col_mid] + feature_cols_mid

# select the chosen columns
readings_mid = readings[cols_mid].copy()

In [ ]:
readings_mid.DistNbBikes = readings_mid.DistNbBikes.fillna(method='ffill', limit=0).fillna(0)
readings_mid.CollNbBikes = readings_mid.CollNbBikes.fillna(method='ffill', limit=0).fillna(0)

# remove na
readings_mid.dropna(inplace=True)

In [ ]:
training, validation, test = split_datasets(readings_mid, 'BikePoints_374')

robjects.globalenv['training'] = training
robjects.globalenv['validation'] = validation

In [ ]:
%%R

library(mgcv)

gamModel <- mgcv::gam(NbBikes ~ 
                      s(TempTMinus12, HumidityTMinus12, bs='tp') 
                      + s(TimeOfDay, by=Weekday, bs='cc')                  
                      + s(TimeOfDay, by=Weekend, bs='cc')               
                      + s(TimeOfDay, by=Holiday, bs='cc')               
                      + s(NbBikesTMinus12, bs='tp')
                      + s(NbBikesTMinus18, bs='tp')                      
                      + DistNbBikes + CollNbBikes
                      ,data=training)

print(summary(gamModel))

p_val <- predict(gamModel, newdata=validation)
error <- validation$NbBikes - p_val
rmse <- function(error)
{
    sqrt(mean(error^2))
}
print(rmse(error))
    
layout(matrix(1:3, ncol = 3))
plot(gamModel, scale = 0)
layout(1)
    
#layout(matrix(1:3, ncol = 3))
#acf(resid(gamModel), lag.max = 36, main = "ACF")
#pacf(resid(gamModel), lag.max = 36, main = "pACF")
#layout(1)
    
#gamModel$lme

Bicycle Redistribution


In [ ]:
start_time = time.time()

distributed = pickle.load(open('data/parsed/distributed_dataset_final.p', 'rb'))
distributed.sort_values(by=['Id', 'Timestamp'], inplace=True)
distributed = distributed.query('NbBikes != 0')
distributed.set_index(['Id', 'Timestamp'], inplace=True)
distributed.drop(['ShortName', 'Name'], axis=1, inplace=True)

collected = pickle.load(open('data/parsed/collected_dataset_final.p', 'rb'))
collected.sort_values(by=['Id', 'Timestamp'], inplace=True)
collected = collected.query('NbBikes != 0')
collected.set_index(['Id', 'Timestamp'], inplace=True)
collected.drop(['ShortName', 'Name'], axis=1, inplace=True)

end_time = time.time()
print 'Opening redistribution data took %s' % (end_time - start_time)

In [ ]:
station_ids = readings.index.get_level_values('Id').unique().tolist()
station_ids = station_ids[0:100]
# restrict this experiment to a single station
stations_ids = ['BikePoints_374']

In [ ]:
# choose the columns to use in the model
boolean_cols_mid = ['Weekday', 'Weekend', 'Holiday']
numeric_cols_mid = ['HumidityTMinus12', 'TempTMinus12', 'TimeOfDay', 'NbBikesTMinus12', 'NbBikesTMinus18']
pred_col_mid = 'NbBikes'

feature_cols_mid = numeric_cols_mid + boolean_cols_mid
cols_mid = [pred_col_mid] + feature_cols_mid

# select the chosen columns
readings_mid = readings[cols_mid].copy()

# remove na
readings_mid.dropna(inplace=True)

readings_mid = readings_mid.loc[stations_ids]

In [ ]:
dist_sum = np.zeros(len(readings_mid))
dist_events = np.zeros(len(readings_mid))
coll_sum = np.zeros(len(readings_mid))
coll_events = np.zeros(len(readings_mid))

# for every reading, sum and count the redistribution events in the preceding window
period = 1
i = 0
for idx, row in readings_mid.iterrows():
    station_id, end = idx
    start = end - pd.Timedelta(minutes=5*period)
    
    if station_id in distributed.index:    
        distributed_window = distributed.loc[station_id].loc[start:end]
        dist_sum[i] = distributed_window.sum()
        dist_events[i] = distributed_window.count() 
    
    if station_id in collected.index:    
        collected_window = collected.loc[station_id].loc[start:end]
        coll_sum[i] = collected_window.sum()
        coll_events[i] = collected_window.count()  
    
    i+=1
    
readings_mid['DistSumTMinus6'] = dist_sum
readings_mid['DistEvtTMinus6'] = dist_events
readings_mid['CollSumTMinus6'] = coll_sum
readings_mid['CollEvtTMinus6'] = coll_events

In [ ]:
readings_mid.to_csv('test.csv')
print 'a'

In [ ]:
gam_formula_mid = "NbBikes ~ s(TempTMinus12, HumidityTMinus12, bs='tp') + s(TimeOfDay, by=Weekday, bs='tp') "
gam_formula_mid += "+ s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') + s(NbBikesTMinus12, bs='tp') "
gam_formula_mid += "+ s(NbBikesTMinus18, bs='tp')"

station_ids = readings_mid.index.get_level_values('Id').unique()        
results_mid = model(readings_mid, station_ids, gam_formula_mid, feature_cols_mid, pred_col_mid)

pd.DataFrame(results_mid)[['Id', 'GAM', 'LM']].mean()

In [ ]:
gam_formula_mid = "NbBikes ~ s(TempTMinus12, HumidityTMinus12, bs='tp') + s(TimeOfDay, by=Weekday, bs='tp') "
gam_formula_mid += "+ s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') + s(NbBikesTMinus12, bs='tp') "
gam_formula_mid += "+ s(NbBikesTMinus18, bs='tp') + DistSumTMinus6 + CollSumTMinus6 "

station_ids = readings_mid.index.get_level_values('Id').unique()        
results_mid = model(readings_mid, station_ids, gam_formula_mid, feature_cols_mid, pred_col_mid)

pd.DataFrame(results_mid)[['Id', 'GAM', 'LM']].mean()

In [ ]:
gam_formula_mid = "NbBikes ~ s(TempTMinus12, HumidityTMinus12, bs='tp') + s(TimeOfDay, by=Weekday, bs='tp') "
gam_formula_mid += "+ s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') + s(NbBikesTMinus12, bs='tp') "
gam_formula_mid += "+ s(NbBikesTMinus18, bs='tp') + DistEvtTMinus6 + CollEvtTMinus6"

station_ids = readings_mid.index.get_level_values('Id').unique()        
results_mid = model(readings_mid, station_ids, gam_formula_mid, feature_cols_mid, pred_col_mid)

pd.DataFrame(results_mid)[['Id', 'GAM', 'LM']].mean()

In [ ]:
gam_formula_mid = "NbBikes ~ s(TempTMinus12, HumidityTMinus12, bs='tp') + s(TimeOfDay, by=Weekday, bs='tp') "
gam_formula_mid += "+ s(TimeOfDay, by=Weekend, bs='tp') + s(TimeOfDay, by=Holiday, bs='tp') + s(NbBikesTMinus12, bs='tp') "
gam_formula_mid += "+ s(NbBikesTMinus18, bs='tp') + DistSumTMinus6 + CollSumTMinus6 + DistEvtTMinus6 + CollEvtTMinus6"

station_ids = readings_mid.index.get_level_values('Id').unique()        
results_mid = model(readings_mid, station_ids, gam_formula_mid, feature_cols_mid, pred_col_mid)

pd.DataFrame(results_mid)[['Id', 'GAM', 'LM']].mean()