In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.cross_validation import KFold
data_path = '/devdata/course_proj/aml/'
In [2]:
train = pd.read_csv(data_path+'train.csv').iloc[:, 1:]
testSet = pd.read_csv(data_path+'test.csv').iloc[:, 1:]
In [ ]:
In [3]:
# encode features for random forest and concatenate them back to dataframe
train = pd.concat([train, train['season'].str.get_dummies(sep=',')], axis=1)
train = pd.concat([train, train['dayOfWeek'].str.get_dummies(sep=',')], axis=1)
testSet = pd.concat([testSet, testSet['season'].str.get_dummies(sep=',')], axis=1)
testSet = pd.concat([testSet, testSet['dayOfWeek'].str.get_dummies(sep=',')], axis=1)
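One caveat with str.get_dummies is that each frame only gets columns for the categories it actually contains, which is why the test set below ends up without 'spring', 'summer', and 'fall'. A minimal sketch of one way to keep the two frames aligned on the union of dummy columns; the variable names are illustrative, not part of the original pipeline.
In [ ]:
# sketch (assumption): build the dummy columns for both frames, then reindex
# each on the union of columns so missing categories become all-zero columns
season_train = train['season'].str.get_dummies(sep=',')
season_test = testSet['season'].str.get_dummies(sep=',')
season_cols = sorted(set(season_train.columns) | set(season_test.columns))
season_train = season_train.reindex(columns=season_cols, fill_value=0)
season_test = season_test.reindex(columns=season_cols, fill_value=0)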
In [8]:
# gradient boosting regressor for feature extraction
def gbm_features(trained_gbm, data_x):
    # select the columns whose importance in the fitted GBM is above the threshold
    X_gbm = trained_gbm.transform(data_x)
    return X_gbm

# gradient boosting regressor with least-squares loss
params = {'n_estimators': 200, 'max_depth': 4, 'min_samples_split': 1,
          'learning_rate': 0.01, 'loss': 'ls'}
gbm = GradientBoostingRegressor(**params)
gbm.fit(X_train.values, y_train.values)
X_gbm_train = gbm.transform(X_train.values)
X_gbm_test = gbm.transform(X_test.values)
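In the scikit-learn version used here, GradientBoostingRegressor.transform keeps only the columns whose feature importance exceeds a threshold; the method was removed in later releases. A hedged sketch of the same step with SelectFromModel, the current equivalent (a substitute for, not the call used above).
In [ ]:
# sketch (assumption): reproduce the old gbm.transform() behaviour with
# SelectFromModel, keeping features whose importance is above the mean
from sklearn.feature_selection import SelectFromModel
selector = SelectFromModel(gbm, threshold='mean', prefit=True)  # gbm fitted above
X_gbm_train_alt = selector.transform(X_train.values)
X_gbm_test_alt = selector.transform(X_test.values)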
In [ ]:
In [33]:
# append the GBM-selected feature columns to the original feature frames
frame_x_train = [X_train, pd.DataFrame(X_gbm_train)]
x_train_concat_train = pd.concat(frame_x_train, axis=1)
frame_x_test = [X_test, pd.DataFrame(X_gbm_test)]
x_train_concat_test = pd.concat(frame_x_test, axis=1)
In [4]:
# the current test set has no 'spring', 'summer', or 'fall' rows, so only 'winter' is used
cols = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
        'holiday', 'winter', 'stationID',
        'max', 'min', 'rain', 'snow']
X_train = train[cols]
y_train = train.visited
X_test = testSet[cols]
y_test = testSet.visited
In [34]:
# random forest regressor
# determine how many trees to use
rmse = []
r2 = []
numTrees = range(50, 100)
for i in numTrees:
    rf = RandomForestRegressor(n_estimators=i)
    rf.fit(x_train_concat_train, y_train)
    pred = rf.predict(x_train_concat_test)
    r2.append(r2_score(y_test, pred))
    rmse.append(np.sqrt(np.mean((y_test - pred)**2)))
print(rmse)
In [ ]:
from sklearn.svm import SVR
svr_rmse = []
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
y_rbf = svr_rbf.fit(X_train, y_train).predict(X_test)
svr_rmse.append(np.sqrt(np.mean((y_test - y_rbf)**2)))
print(svr_rmse)
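An RBF-kernel SVR is sensitive to feature scale, and these columns mix 0/1 dummies with raw weather values, so the unscaled fit above is a weak baseline. A sketch of the usual remedy, standardising inside a pipeline; the C and gamma values are simply reused from above, not tuned.
In [ ]:
# sketch (assumption): standardise the inputs so the RBF kernel width is not
# dominated by the large-range weather columns
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
svr_scaled = make_pipeline(StandardScaler(), SVR(kernel='rbf', C=1e3, gamma=0.1))
y_rbf_scaled = svr_scaled.fit(X_train, y_train).predict(X_test)
print(np.sqrt(np.mean((y_test - y_rbf_scaled)**2)))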
In [11]:
# one-standard-deviation rule: take the smallest forest whose RMSE
# is within one standard deviation of the best RMSE
bestTreeNum = numTrees[np.argmin(rmse)]
std = np.std(rmse)
for i in range(len(rmse)):
    if rmse[i] < min(rmse) + std:
        minTree = numTrees[i]
        minRmse = rmse[i]
        break
print(minTree, minRmse)
In [13]:
from ggplot import *
%matplotlib inline
scoresRF = pd.DataFrame({
'numTrees': pd.Series(numTrees),
'RMSE': pd.Series(rmse),
'R2': pd.Series(r2)})
ggplot(aes(x='numTrees', y='RMSE'), data=scoresRF) + geom_line()
Out[13]:
In [25]:
# gradient boosting regressor with least-squares loss
params = {'n_estimators': 2000, 'max_depth': 4, 'min_samples_split': 1,
          'learning_rate': 0.01, 'loss': 'ls'}
# cross-validate to determine how many trees to use
def heldoutScore(gbm, X, y):
    """Compute the held-out deviance after each boosting stage."""
    scores = np.zeros(params['n_estimators'])
    for i, y_pred in enumerate(gbm.staged_predict(X)):
        scores[i] = gbm.loss_(y, y_pred)
    return scores

def cvGBM(n_folds=5):
    """Cross-validate the GBM and return the mean held-out deviance per stage."""
    cv = KFold(n=X_train.shape[0], n_folds=n_folds)
    gbm = GradientBoostingRegressor(**params)
    loss = np.zeros(params['n_estimators'])
    for train_idx, test_idx in cv:
        gbm.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
        loss += heldoutScore(gbm, X_train.iloc[test_idx], y_train.iloc[test_idx])
    loss /= n_folds
    return loss
# Estimate best n_estimator using cross-validation
cvScores = cvGBM()
x = np.arange(1, params['n_estimators'] + 1)
cvBestIter = x[np.argmin(cvScores)]
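As an aside, newer scikit-learn releases (0.20 and later) can choose the number of boosting iterations with built-in early stopping instead of the manual staged cross-validation above; a sketch, with validation_fraction and n_iter_no_change values picked for illustration only.
In [ ]:
# sketch (assumption): stop adding trees once the score on an internal
# validation split has not improved for 20 iterations
gbm_es = GradientBoostingRegressor(n_estimators=2000, max_depth=4,
                                   learning_rate=0.01,
                                   validation_fraction=0.2,
                                   n_iter_no_change=20, random_state=0)
gbm_es.fit(X_train, y_train)
print(gbm_es.n_estimators_)  # number of trees actually fitted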
In [93]:
# plot cv deviance
scoresGbmCV = pd.DataFrame({
'numTrees': pd.Series(x),
'deviance': pd.Series(cvScores)})
ggplot(aes(x='numTrees', y='deviance'), data=scoresGbmCV) + geom_line() + labs(title='GBM CV Deviance')
Out[93]:
In [95]:
# test on test set with cv result
params = {'n_estimators': cvBestIter, 'max_depth': 4, 'min_samples_split': 1,
'learning_rate': 0.01, 'loss': 'ls'}
gbm = GradientBoostingRegressor(**params)
gbm.fit(X_train, y_train)
rmse = np.sqrt(mean_squared_error(y_test, gbm.predict(X_test)))
print(rmse)
# get stage loss
loss = np.zeros(params['n_estimators'])
for i, pred in enumerate(gbm.staged_predict(X_test)):
    loss[i] = gbm.loss_(y_test, pred)
In [100]:
# plot test deviance
scoresGBM = pd.DataFrame({
    'numTrees': pd.Series(np.arange(1, params['n_estimators'] + 1)),
    'deviance': pd.Series(loss)})
ggplot(aes(x='numTrees', y='deviance'), data=scoresGBM) + geom_line() + labs(title='GBM Test Deviance')
Out[100]:
In [115]:
# write model to pickle for later use
from sklearn.externals import joblib
joblib.dump(gbm, 'gbm.pkl', compress=9)
Out[115]:
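For completeness, a minimal sketch of reloading the pickled model and scoring it again, assuming the same joblib import as above.
In [ ]:
# sketch: reload the persisted booster and recompute the test RMSE
gbm_loaded = joblib.load('gbm.pkl')
print(np.sqrt(mean_squared_error(y_test, gbm_loaded.predict(X_test))))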