In [72]:
import warnings
import pickle
import geopy
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import geodesic  # vincenty was removed in geopy 2.0; geodesic is its replacement
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation is long gone
from sklearn.preprocessing import normalize
from sklearn.preprocessing import scale
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search is long gone
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
pd.set_option('display.max_columns', 1800)
plt.style.use('ggplot')
warnings.filterwarnings('ignore')
%matplotlib inline
In [41]:
mpl.rc('savefig', dpi=200)
params = {'figure.dpi': 200,
          'figure.figsize': (12, 10),
          'axes.axisbelow': True,
          'lines.antialiased': True,
          'axes.titlesize': 'xx-large',
          'axes.labelsize': 'x-large',
          'xtick.labelsize': 'large',
          'ytick.labelsize': 'large'}
for k, v in params.items():
    plt.rcParams[k] = v
In [42]:
df = pd.read_csv("DATA/babs_master/merged_master.csv")
df_station = pd.read_csv("DATA/babs_master/station_master.csv")
df_weather = pd.read_csv("DATA/babs_master/weather_master.csv")
In [43]:
distance = []
for i in range(len(df)):
    # NB: 'Latitute' matches the (misspelled) column names in the source CSV
    origin = (df['Start Latitute'][i], df['Start Longitude'][i])
    destination = (df['End Latitute'][i], df['End Longitude'][i])
    distance.append(geodesic(origin, destination).miles)
df['distance'] = distance
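The same column can also be built without per-row DataFrame indexing, which is noticeably faster on a large trip table. A minimal, equivalent sketch (same column names assumed):
In [ ]:
# Loop-free variant of the cell above: zip the coordinate columns once
# instead of indexing the DataFrame on every iteration.
df['distance'] = [geodesic(o, d).miles
                  for o, d in zip(zip(df['Start Latitute'], df['Start Longitude']),
                                  zip(df['End Latitute'], df['End Longitude']))]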
In [70]:
# df['distance'].mean()
In [44]:
print(df.columns)
print(df_station.columns)
print(df_weather.columns)
type(df_weather['Date'][1])
In [52]:
cols_target = ['Duration_in_mins']
features = ['month', 'day_of_month', 'hour_of_day', 'day_of_week', 'weekend', 'distance']
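The time-based features are assumed to be precomputed in merged_master.csv. For reference, a sketch of how they could be derived from a raw start-time column (the column name 'Start Date' is hypothetical, not confirmed by this notebook):
In [ ]:
# Hypothetical reconstruction of the time features from a timestamp column.
start = pd.to_datetime(df['Start Date'])  # 'Start Date' is an assumed name
df['month'] = start.dt.month
df['day_of_month'] = start.dt.day
df['hour_of_day'] = start.dt.hour
df['day_of_week'] = start.dt.dayofweek               # Monday=0 .. Sunday=6
df['weekend'] = (df['day_of_week'] >= 5).astype(int)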
In [62]:
df['Duration_in_mins'].mean()
In [53]:
df.shape
In [54]:
X_train = df[features]
y_train = df[cols_target]
In [64]:
X_train.head()
In [56]:
names = X_train.columns  # keep all six names so they line up with feature_importances_
In [57]:
rf = RandomForestRegressor(random_state=1868)
rf.fit(X_train, y_train.values.ravel())  # ravel to a 1-D target array
print("Features sorted by their score:")
print(sorted(zip(rf.feature_importances_.round(2), names), reverse=True))
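Since matplotlib and seaborn are already configured above, the same importances can be plotted for a quicker read; a small sketch:
In [ ]:
# Horizontal bar plot of the random forest feature importances (sketch).
imp = pd.Series(rf.feature_importances_, index=names).sort_values()
imp.plot(kind='barh')
plt.xlabel('feature importance')
plt.title('Random forest feature importances')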
In [61]:
abs(np.mean(cross_val_score(rf, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')))**0.5  # cross-validated RMSE in minutes
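That RMSE is easiest to judge against a trivial baseline that always predicts the mean duration; a quick sketch using scikit-learn's DummyRegressor:
In [ ]:
# Baseline cross-validated RMSE: always predict the training-set mean.
from sklearn.dummy import DummyRegressor
dummy = DummyRegressor(strategy='mean')
abs(np.mean(cross_val_score(dummy, X_train, y_train.values.ravel(),
                            cv=5, scoring='neg_mean_squared_error')))**0.5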
In [73]:
with open('rf_regressor_duration.pickle', 'wb') as f:
    pickle.dump(rf, f)
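For completeness, the pickled model can be restored in a later session like so:
In [ ]:
# Reload the pickled random forest (sketch) and sanity-check it.
with open('rf_regressor_duration.pickle', 'rb') as f:
    rf_loaded = pickle.load(f)
rf_loaded.predict(X_train.head())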
In [ ]:
param_grid = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
              'max_depth': [4, 6],
              'min_samples_leaf': [3, 5, 9, 17],
              # 'max_features': [1.0, 0.3, 0.1],  # left out to keep the grid small
              }
est = GradientBoostingRegressor(n_estimators=100)
# this may take a few minutes
gs_cv = GridSearchCV(est, param_grid, n_jobs=4).fit(X_train, y_train.values.ravel())
# best hyperparameter setting
gs_cv.best_params_
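The winning configuration can be reused either as a plain dict or as a ready-made estimator; a short sketch:
In [ ]:
# Two ways to reuse the grid search result (sketch):
print(gs_cv.best_params_)         # e.g. plug these values into the cell below
best_gbr = gs_cv.best_estimator_  # refit on all of X_train when refit=True (the default)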
In [ ]:
gbr = GradientBoostingRegressor(learning_rate=0.1, n_estimators=100, max_depth=6, min_samples_leaf=5)  # only non-default settings; the rest were defaults
print(abs(np.mean(cross_val_score(gbr, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')))**0.5)  # cross-validated RMSE in minutes
gbr.fit(X_train, y_train.values.ravel())  # fit before pickling; cross_val_score only fits clones
In [ ]:
with open('gbr_regressor_duration.pickle', 'wb') as f:
    pickle.dump(gbr, f)