In [1]:
# Standard library
import pickle
import warnings

# Third-party: numerics, plotting, modelling
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import scale
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `train_test_split` now lives in `sklearn.model_selection`.
# Fall back to the legacy path so the notebook still runs on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split
# Apply the 'ggplot' matplotlib style sheet to figures created from here on.
plt.style.use('ggplot')
# Install a blanket "ignore" filter: no Python warnings are shown after this
# line (NOTE(review): this also hides deprecation warnings from libraries).
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
In [2]:
# Resolution used when figures are written to disk with savefig.
mpl.rc('savefig', dpi=200)

# Notebook-wide figure defaults: resolution, size, and title/label/tick font
# sizes, plus grid-below-data and antialiased lines.
params = {
    'figure.dpi': 200,
    'figure.figsize': (12, 10),
    'axes.axisbelow': True,
    'lines.antialiased': True,
    'axes.titlesize': 'xx-large',
    'axes.labelsize': 'x-large',
    'xtick.labelsize': 'large',
    'ytick.labelsize': 'large',
}

# `dict.items()` works on both Python 2 and Python 3; the original
# `iteritems()` exists only on Python 2 and raises AttributeError on 3.
# Assigning through rcParams[k] keeps matplotlib's per-key validation.
for k, v in params.items():
    plt.rcParams[k] = v
In [3]:
# Load the station status table from CSV (path is relative to the
# notebook's working directory).
df_status = pd.read_csv(
    'DATA/babs_master/status_master_60m.csv'
)
In [4]:
# Peek at the first five rows of the freshly loaded table.
df_status.head(n=5)
Out[4]:
In [5]:
# Convert the 'time' column to pandas datetimes so calendar fields can be
# extracted from it below.
df_status['time'] = pd.to_datetime(arg=df_status['time'])
In [6]:
# Derive calendar features from the parsed 'time' column.
# The vectorised `.dt` accessor yields the same values as the former per-row
# `.apply(lambda x: ...)` calls without invoking a Python function for every
# row. It requires 'time' to already be a datetime64 column.
df_status['month'] = df_status['time'].dt.month
# Day of week as an integer, Monday=0 ... Sunday=6.
df_status['dow'] = df_status['time'].dt.weekday
# 1 for Saturday/Sunday (dow 5 or 6), 0 otherwise.
df_status['weekend'] = df_status['dow'].isin([5, 6]).astype(int)
df_status['hour'] = df_status['time'].dt.hour
In [7]:
# Keep only the modelling columns, with the target ('bikes_available') first
# and the predictors after it. NOTE: this rebinds df_status, so the 'time'
# column is no longer available afterwards.
df_status = df_status.loc[
    :,
    ['bikes_available', 'month', 'dow', 'weekend', 'hour', 'station_id'],
]
In [8]:
# Show the first five rows of the reduced modelling table.
df_status.head(n=5)
Out[8]:
In [9]:
# Feature matrix: every column after the first one, as a NumPy array.
X = df_status.iloc[:, 1:].values
# Target vector: the number of bikes available.
y = df_status['bikes_available'].values
In [10]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)
In [11]:
# Random forest with library-default hyperparameters and a fixed seed.
rf = RandomForestRegressor(
    random_state=42,
)
In [12]:
# Fit on the training split only; the fitted estimator is the cell output.
rf.fit(X=X_train, y=y_train)
Out[12]:
In [13]:
# Predict on the held-out rows and round each prediction to zero decimals.
yhat = rf.predict(X_test).round(decimals=0)
In [14]:
# Root-mean-squared error of the rounded predictions on the test split.
pow(mean_squared_error(y_true=y_test, y_pred=yhat), 0.5)
Out[14]:
In [15]:
# Rebuild the regressor with the same seed and fit it on ALL rows (train and
# test together). NOTE: this rebinds `rf`, so the model evaluated above is
# replaced by one that has seen the test rows.
rf = RandomForestRegressor(
    random_state=42,
)
rf.fit(X=X, y=y)
Out[15]:
In [16]:
# Serialise the fitted regressor to disk; the context manager closes the
# file even if pickling fails.
with open('rf_regressor_availability.pickle', mode='wb') as model_file:
    pickle.dump(rf, model_file)
In [ ]: