Imports


In [1]:
import pickle
import warnings

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split  # moved from sklearn.cross_validation in newer scikit-learn
from sklearn.preprocessing import scale
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

plt.style.use('ggplot')
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
mpl.rc('savefig', dpi=200)
params = {'figure.dpi' : 200,
          'figure.figsize' : (12, 10),
          'axes.axisbelow' : True,
          'lines.antialiased' : True,
          'axes.titlesize' : 'xx-large',
          'axes.labelsize' : 'x-large',
          'xtick.labelsize' : 'large',
          'ytick.labelsize' : 'large'}

for (k, v) in params.items():
    plt.rcParams[k] = v

Data


In [3]:
df_status = pd.read_csv('DATA/babs_master/status_master_60m.csv')

In [4]:
df_status.head()


Out[4]:
   station_id                 time  bikes_available
0           2  2013-08-29 12:00:00                2
1           2  2013-08-29 13:00:00                3
2           2  2013-08-29 14:00:00                2
3           2  2013-08-29 15:00:00                2
4           2  2013-08-29 16:00:00                2

In [5]:
df_status['time'] = pd.to_datetime(df_status['time'])

In [6]:
# Derive calendar features from the timestamp via the .dt accessor
df_status['month'] = df_status['time'].dt.month
df_status['dow'] = df_status['time'].dt.weekday          # Monday=0 ... Sunday=6
df_status['weekend'] = df_status['dow'].isin([5, 6]) * 1  # 1 on Sat/Sun, else 0
df_status['hour'] = df_status['time'].dt.hour

In [7]:
df_status = df_status[['bikes_available', 'month', 'dow', 'weekend', 'hour', 'station_id']]

In [8]:
df_status.head()


Out[8]:
   bikes_available  month  dow  weekend  hour  station_id
0                2      8    3        0    12           2
1                3      8    3        0    13           2
2                2      8    3        0    14           2
3                2      8    3        0    15           2
4                2      8    3        0    16           2

Model

Train Test Split


In [9]:
# Features are every column after the target: month, dow, weekend, hour, station_id
X = df_status[df_status.columns[1:]].values
# Target is the hourly bike availability count
y = df_status.bikes_available.values

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [11]:
rf = RandomForestRegressor(random_state=42)

In [12]:
rf.fit(X_train, y_train)


Out[12]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [13]:
# Round predictions to whole bikes, since availability is an integer count
yhat = np.round(rf.predict(X_test), 0)

In [14]:
mean_squared_error(y_test, yhat)**0.5


Out[14]:
3.3566612332486216
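
For a rough sense of scale (not part of the original run), the RMSE of roughly 3.4 bikes can be compared against a naive baseline that always predicts the training-set mean. The snippet below is a minimal sketch of that check, reusing the split and predictions from the cells above.

# Hypothetical sanity check: RMSE of a constant-mean baseline on the same test set.
baseline = np.full_like(y_test, y_train.mean(), dtype=float)
print('baseline RMSE:', mean_squared_error(y_test, baseline)**0.5)
print('random forest RMSE:', mean_squared_error(y_test, yhat)**0.5)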

Fit on All Data


In [15]:
rf = RandomForestRegressor(random_state=42)
rf.fit(X, y)


Out[15]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)
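
One optional follow-up, sketched here rather than run in the original notebook, is to check which inputs the fitted forest leans on via its feature_importances_ attribute; the feature names below are taken from the column order used to build X.

# Sketch: pair each feature name with its importance score from the fitted model.
feature_names = ['month', 'dow', 'weekend', 'hour', 'station_id']
for name, importance in sorted(zip(feature_names, rf.feature_importances_),
                               key=lambda pair: pair[1], reverse=True):
    print(name, round(importance, 3))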

In [16]:
with open('rf_regressor_availability.pickle','wb') as f:
    pickle.dump(rf, f)
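
To round things out, here is a minimal sketch (not executed above) of how the pickled regressor could later be loaded and queried. The feature order must match the one used for training (month, dow, weekend, hour, station_id), and the example row values are purely illustrative.

# Load the saved model and predict availability for one illustrative feature row.
with open('rf_regressor_availability.pickle', 'rb') as f:
    rf_loaded = pickle.load(f)

# [month, dow, weekend, hour, station_id] -- hypothetical values for demonstration.
example = np.array([[8, 3, 0, 12, 2]])
print(rf_loaded.predict(example))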
