Imports



In [1]:

    
import pickle
import warnings

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. Fall back to
# the old module so the notebook still runs on pre-0.18 installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import scale
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Render every figure with matplotlib's 'ggplot' style sheet.
plt.style.use('ggplot')
# Silence all warnings for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries imported above.
warnings.filterwarnings('ignore')

%matplotlib inline



In [2]:

    
# Global matplotlib defaults: 200 dpi for saved and displayed figures, a
# 12x10 inch canvas, grid/axes drawn below the data, and larger title, axis
# label and tick label fonts.
mpl.rc('savefig', dpi=200)
params = {'figure.dpi' : 200,
          'figure.figsize' : (12, 10),
          'axes.axisbelow' : True,
          'lines.antialiased' : True,
          'axes.titlesize' : 'xx-large',
          'axes.labelsize' : 'x-large',
          'xtick.labelsize' : 'large',
          'ytick.labelsize' : 'large'}

# dict.items() is available on both Python 2 and Python 3; the Python 2-only
# dict.iteritems() raises AttributeError on Python 3.
for (key, value) in params.items():
    plt.rcParams[key] = value

Data



In [3]:

    
# Station status table; the preview below shows one row per station per hour
# with the number of bikes available.
status_csv = 'DATA/babs_master/status_master_60m.csv'
df_status = pd.read_csv(status_csv)



In [4]:

    
# Preview the first rows of the raw table to confirm its columns.
df_status.head()









    Out[4]:






  
    
      
      station_id
      time
      bikes_available
    
  
  
    
      0
      2
      2013-08-29 12:00:00
      2
    
    
      1
      2
      2013-08-29 13:00:00
      3
    
    
      2
      2
      2013-08-29 14:00:00
      2
    
    
      3
      2
      2013-08-29 15:00:00
      2
    
    
      4
      2
      2013-08-29 16:00:00
      2



In [5]:

    
# The CSV is read without date parsing, so convert the 'time' column to
# pandas datetimes before deriving calendar features from it.
parsed_time = pd.to_datetime(df_status['time'])
df_status['time'] = parsed_time



In [6]:

    
# Derive calendar features from the timestamp with the vectorized `.dt`
# accessor instead of a per-row Python lambda through `.apply`; the values are
# the same but no Python-level call is made for each row.
timestamps = df_status['time']
df_status['month'] = timestamps.dt.month
# Monday=0 ... Sunday=6, the same convention as datetime.weekday().
df_status['dow'] = timestamps.dt.dayofweek
# 1 for Saturday/Sunday (dow 5 or 6), 0 otherwise.
df_status['weekend'] = df_status['dow'].isin([5, 6]) * 1
df_status['hour'] = timestamps.dt.hour



In [7]:

    
# Keep only the modelling columns, target first: the X / y extraction further
# down takes every column after the first as a feature.
model_columns = ['bikes_available', 'month', 'dow', 'weekend', 'hour', 'station_id']
df_status = df_status[model_columns]



In [8]:

    
# Preview the reduced frame: target column followed by the derived features.
df_status.head()









    Out[8]:






  
    
      
      bikes_available
      month
      dow
      weekend
      hour
      station_id
    
  
  
    
      0
      2
      8
      3
      0
      12
      2
    
    
      1
      3
      8
      3
      0
      13
      2
    
    
      2
      2
      8
      3
      0
      14
      2
    
    
      3
      2
      8
      3
      0
      15
      2
    
    
      4
      2
      8
      3
      0
      16
      2

Model

Train Test Split



In [9]:

    
# Features are every column after the first; the target is the
# 'bikes_available' column. Both are taken as plain NumPy arrays.
X = df_status.iloc[:, 1:].values
y = df_status['bikes_available'].values



In [10]:

    
# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts



In [11]:

    
# Random forest with library-default hyperparameters and a fixed seed.
# NOTE(review): the recorded repr below shows n_estimators=10 for the
# scikit-learn version this was run with; defaults may differ in other versions.
rf = RandomForestRegressor(random_state=42)



In [12]:

    
# Fit on the training split only; X_test / y_test stay held out for scoring.
rf.fit(X_train, y_train)









    Out[12]:





RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)



In [13]:

    
# Predict on the held-out rows, then round to whole numbers (0 decimals).
raw_predictions = rf.predict(X_test)
yhat = np.round(raw_predictions, 0)



In [14]:

    
# Root-mean-squared error of the rounded predictions on the held-out split.
rmse = mean_squared_error(y_test, yhat) ** 0.5
rmse









    Out[14]:





3.3566612332486216

Fit on All Data



In [15]:

    
# Rebuild the forest with the same seed and fit it on the full dataset
# (X, y) rather than the training split. This rebinds `rf`, replacing the
# model that was scored above; the pickle cell below saves this one.
rf = RandomForestRegressor(random_state=42)
rf.fit(X, y)









    Out[15]:





RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)



In [16]:

    
# Serialize the fitted forest to disk in binary mode.
model_path = 'rf_regressor_availability.pickle'
with open(model_path, 'wb') as model_file:
    pickle.dump(rf, model_file)



In [ ]:

	station_id	time	bikes_available
0	2	2013-08-29 12:00:00	2
1	2	2013-08-29 13:00:00	3
2	2	2013-08-29 14:00:00	2
3	2	2013-08-29 15:00:00	2
4	2	2013-08-29 16:00:00	2