In [72]:
import warnings

import pickle
import geopy
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import vincenty
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import normalize
from sklearn.preprocessing import scale
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

pd.set_option('display.max_columns', 1800)
plt.style.use('ggplot')
warnings.filterwarnings('ignore')

%matplotlib inline

In [41]:
mpl.rc('savefig', dpi=200)
params = {'figure.dpi' : 200,
          'figure.figsize' : (12, 10),
          'axes.axisbelow' : True,
          'lines.antialiased' : True,
          'axes.titlesize' : 'xx-large',
          'axes.labelsize' : 'x-large',
          'xtick.labelsize' : 'large',
          'ytick.labelsize' : 'large'}

for (k, v) in params.items():
    plt.rcParams[k] = v

In [42]:
df = pd.read_csv("DATA/babs_master/merged_master.csv")
df_station = pd.read_csv("DATA/babs_master/station_master.csv")
df_weather = pd.read_csv("DATA/babs_master/weather_master.csv")

In [43]:
distance = []
for i in range(len(df)):
    origin = (df['Start Latitute'][i], df['Start Longitude'][i])
    destination = (df['End Latitute'][i], df['End Longitude'][i])
    distance.append(vincenty(origin, destination).miles)

df['distance'] = distance

In [70]:
# df['distance'].mean()


Out[70]:
0.8308083377021603

In [44]:
print(df.columns)
print(df_station.columns)
print(df_weather.columns)
type(df_weather['Date'][1])


Index([u'Trip ID', u'Duration', u'Start Date Time', u'Start Station',
       u'Start Terminal', u'End Date Time', u'End Station', u'End Terminal',
       u'Bike #', u'Subscription Type', u'Zip Code', u'Start Date',
       u'Start Time', u'End Date', u'End Time', u'station_id_x',
       u'Start Latitute', u'Start Longitude', u'Start Station Dockcount',
       u'Start Station Landmark', u'Start Station Installation',
       u'station_id_y', u'End Latitute', u'End Longitude',
       u'End Station Dockcount', u'End Station Landmark',
       u'End Station Installation', u'Max_Temperature_F',
       u'Mean_Temperature_F', u'Min_TemperatureF', u'Max_Dew_Point_F',
       u'MeanDew_Point_F', u'Min_Dewpoint_F', u'Max_Humidity',
       u'Mean_Humidity ', u'Min_Humidity ', u'Max_Sea_Level_Pressure_In ',
       u'Mean_Sea_Level_Pressure_In ', u'Min_Sea_Level_Pressure_In ',
       u'Max_Visibility_Miles ', u'Mean_Visibility_Miles ',
       u'Min_Visibility_Miles ', u'Max_Wind_Speed_MPH ',
       u'Mean_Wind_Speed_MPH ', u'Max_Gust_Speed_MPH', u'Precipitation_In ',
       u'Cloud_Cover ', u'Events', u'Wind_Dir_Degrees', u'month',
       u'day_of_month', u'hour_of_day', u'day_of_week', u'weekend',
       u'Duration_in_mins', u'distance'],
      dtype='object')
Index([u'station_id', u'name', u'lat', u'long', u'dockcount', u'landmark',
       u'installation'],
      dtype='object')
Index([u'Date', u'Max_Temperature_F', u'Mean_Temperature_F',
       u'Min_TemperatureF', u'Max_Dew_Point_F', u'MeanDew_Point_F',
       u'Min_Dewpoint_F', u'Max_Humidity', u'Mean_Humidity ', u'Min_Humidity ',
       u'Max_Sea_Level_Pressure_In ', u'Mean_Sea_Level_Pressure_In ',
       u'Min_Sea_Level_Pressure_In ', u'Max_Visibility_Miles ',
       u'Mean_Visibility_Miles ', u'Min_Visibility_Miles ',
       u'Max_Wind_Speed_MPH ', u'Mean_Wind_Speed_MPH ', u'Max_Gust_Speed_MPH',
       u'Precipitation_In ', u'Cloud_Cover ', u'Events', u'Wind_Dir_Degrees',
       u'zip'],
      dtype='object')
Out[44]:
str

In [52]:
cols_target = ['Duration_in_mins']
features = ['month', 'day_of_month', 'hour_of_day', 'day_of_week', 'weekend', 'distance']

In [62]:
df['Duration_in_mins'].mean()


Out[62]:
18.484703939824318

In [53]:
df.shape


Out[53]:
(3235125, 56)

Modelling


In [54]:
X_train = df[features]
y_train = df[cols_target]

In [64]:
X_train.head()


Out[64]:
month day_of_month hour_of_day day_of_week weekend distance
0 8 29 14 4 0 0
1 8 29 14 4 0 0
2 8 29 14 4 0 0
3 8 29 14 4 0 0
4 8 29 14 4 0 0

In [56]:
names = X_train.columns[1:-1]

Random Forest Regressor


In [57]:
rf = RandomForestRegressor(random_state=1868)
rf.fit(X_train, y_train)
print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 2), rf.feature_importances_), names), reverse=True))


Features sorted by their score:
[(0.23, 'hour_of_day'), (0.15, 'day_of_week'), (0.12, 'day_of_month'), (0.01, 'weekend')]

In [61]:
abs(np.mean(cross_val_score(rf, X_train, y_train, cv=5, scoring='mean_squared_error')))**0.5


Out[61]:
560.33233701851179

In [73]:
with open('rf_regressor_duration.pickle','wb') as f:
    pickle.dump(rf, f)

Gradient Boosted Regression


In [ ]:
param_grid = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
              'max_depth': [4, 6],
              'min_samples_leaf': [3, 5, 9, 17],
              # 'max_features': [1.0, 0.3, 0.1] ## not possible in our example (only 1 fx)
              }

est = GradientBoostingRegressor(n_estimators=100)
# this may take some minutes
gs_cv = GridSearchCV(est, param_grid, n_jobs=4).fit(X_train, y_train)

# best hyperparameter setting
gs_cv.best_params_

In [ ]:
gbr = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=5, min_weight_fraction_leaf=0.0, max_depth=6, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')
abs(np.mean(cross_val_score(gbr, X_train, y_train, cv=5, scoring='mean_squared_error')))**0.5

In [ ]:
with open('gbr_regressor_duration.pickle','wb') as f:
    pickle.dump(gbr, f)