In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.cross_validation import KFold

data_path = '/devdata/course_proj/aml/'

In [2]:
# drop the first (index) column from each CSV
train = pd.read_csv(data_path+'train.csv').iloc[:, 1:]
testSet = pd.read_csv(data_path+'test.csv').iloc[:, 1:]

In [ ]:


In [3]:
# encode features for random forest and concatenate them back to dataframe
train = pd.concat([train, train['season'].str.get_dummies(sep=',')], axis=1)
train = pd.concat([train, train['dayOfWeek'].str.get_dummies(sep=',')], axis=1)

testSet = pd.concat([testSet, testSet['season'].str.get_dummies(sep=',')], axis=1)
testSet = pd.concat([testSet, testSet['dayOfWeek'].str.get_dummies(sep=',')], axis=1)
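
Because the dummies are generated separately for train and test, any level that never occurs in the test set simply has no column there (e.g. 'spring', 'summer', 'fall', as noted further down). A minimal sketch of one way to keep both frames on the same columns; not run as part of this notebook:

In [ ]:
# add every season/day-of-week dummy column that exists in train but not in test,
# filled with zeros, so both frames expose the same feature space
season_day_cols = pd.concat([train['season'].str.get_dummies(sep=','),
                             train['dayOfWeek'].str.get_dummies(sep=',')], axis=1).columns
for col in season_day_cols.difference(testSet.columns):
    testSet[col] = 0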

In [8]:
# gradient boosting regressor for feature extraction
# (note: X_train/y_train come from the feature-selection cell In [4] below;
#  execution follows the In [] numbers, not the document order)

def gbm_features(trained_gbm, data_x):
    """Reduce data_x to the features the fitted GBM considers important
    (old-sklearn ensemble .transform, i.e. importance-based selection)."""
    return trained_gbm.transform(data_x)

# gradient boosting regressor with least-squares loss
params = {'n_estimators': 200, 'max_depth': 4, 'min_samples_split': 1,
          'learning_rate': 0.01, 'loss': 'ls'}
gbm = GradientBoostingRegressor(**params)
gbm.fit(X_train.values, y_train.values)

X_gbm_train = gbm_features(gbm, X_train.values)
X_gbm_test = gbm_features(gbm, X_test.values)
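
The ensemble .transform() used above is the old scikit-learn API (importance-based feature selection) and was removed in later releases. A minimal sketch of the equivalent on newer versions, assuming the default 'mean' importance threshold; not run here:

In [ ]:
# equivalent of the old gbm.transform() on newer scikit-learn:
# keep only the columns whose feature importance is above the mean importance
from sklearn.feature_selection import SelectFromModel

selector = SelectFromModel(gbm, prefit=True, threshold='mean')
X_gbm_train = selector.transform(X_train.values)
X_gbm_test = selector.transform(X_test.values)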

In [ ]:


In [33]:
# append the GBM-selected feature matrix to the original features for the random forest
frame_x_train = [X_train, pd.DataFrame(X_gbm_train)]
x_train_concat_train = pd.concat(frame_x_train, axis=1)

frame_x_test = [X_test, pd.DataFrame(X_gbm_test)]
x_train_concat_test = pd.concat(frame_x_test, axis=1)

In [4]:
# the current test set has no 'spring', 'summer', or 'fall' rows, so only the 'winter' season dummy is used
cols = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 
        'holiday','winter', 'stationID',
        'max', 'min', 'rain' ,'snow']

X_train = train[cols]
y_train = train.visited
X_test = testSet[cols]
y_test = testSet.visited

In [34]:
# random forest regressor

# determine how many trees to use
rmse = []
r2 = []
# numTrees = range(50, 100)
# for i in numTrees:
#     ... sweep over tree counts (see the sketch after this cell's output)

rf = RandomForestRegressor(n_estimators=200)
rf.fit(x_train_concat_train, y_train)
r2.append(r2_score(y_test, rf.predict(x_train_concat_test)))
rmse.append(np.sqrt(np.mean((y_test - rf.predict(x_train_concat_test))**2)))
print(rmse)


[54.031523136788032]
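
The cells below (In [11] and In [13]) index into numTrees and plot rmse/r2 per tree count, so they assume the commented-out sweep was actually run. A sketch of how that sweep presumably looked, refitting the forest for each candidate size; the exact loop body is an assumption:

In [ ]:
# sweep over tree counts and record test-set RMSE and R^2 for each fit,
# so numTrees, rmse and r2 line up by index
rmse, r2 = [], []
numTrees = range(50, 100)
for i in numTrees:
    rf = RandomForestRegressor(n_estimators=i)
    rf.fit(x_train_concat_train, y_train)
    pred = rf.predict(x_train_concat_test)
    r2.append(r2_score(y_test, pred))
    rmse.append(np.sqrt(np.mean((y_test - pred)**2)))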

In [ ]:
from sklearn.svm import SVR

svr_rmse = []
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
y_rbf = svr_rbf.fit(X_train, y_train).predict(X_test)
svr_rmse.append(np.sqrt(np.mean((y_test - y_rbf)**2)))
print(svr_rmse)
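
RBF-kernel SVR is sensitive to feature scale, so standardizing the inputs usually matters more than the exact C/gamma. A minimal sketch with the same hyperparameters wrapped behind a StandardScaler; not run here:

In [ ]:
# standardize features before the RBF SVR; same C and gamma as above
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

svr_scaled = make_pipeline(StandardScaler(), SVR(kernel='rbf', C=1e3, gamma=0.1))
y_rbf_scaled = svr_scaled.fit(X_train, y_train).predict(X_test)
print(np.sqrt(np.mean((y_test - y_rbf_scaled)**2)))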

In [11]:
# pick the number of trees from the sweep results
bestTreeNum = numTrees[np.argmin(rmse)]

# 1-standard-deviation rule: take the smallest tree count whose RMSE is
# within one standard deviation of the best RMSE
std = np.std(rmse)
for i in range(len(rmse)):
    if rmse[i] < min(rmse) + std:
        minTree = numTrees[i]
        minRmse = rmse[i]
        break
print(minTree, minRmse)


(54, 53.453863891032483)

In [13]:
from ggplot import *
%matplotlib inline

scoresRF = pd.DataFrame({
        'numTrees': pd.Series(numTrees),
        'RMSE': pd.Series(rmse),
        'R2': pd.Series(r2)})

ggplot(aes(x='numTrees', y='RMSE'), data=scoresRF) + geom_line()


Out[13]:
<ggplot: (8775029238001)>

In [25]:
# gradient boosting regressor with least-squares loss
params = {'n_estimators': 2000, 'max_depth': 4, 'min_samples_split': 1,
          'learning_rate': 0.01, 'loss': 'ls'}

# cross-validate to determine how many trees to use
def heldoutScore(gbm, X, y):
    """Compute the held-out deviance at each boosting stage on X and y."""
    scores = np.zeros(params['n_estimators'])
    for i, y_pred in enumerate(gbm.staged_predict(X)):
        scores[i] = gbm.loss_(y, y_pred)
    return scores

def cvGBM(n_folds=5):
    """cross-validation for GBM"""
    cv = KFold(n=X_train.shape[0], n_folds=n_folds)
    gbm = GradientBoostingRegressor(**params)
    loss = np.zeros(params['n_estimators'])
    for train, test in cv:
        gbm.fit(X_train.iloc[train], y_train.iloc[train])
        loss += heldoutScore(gbm, X_train.iloc[test], y_train.iloc[test])
    loss /= n_folds
    return loss

# Estimate best n_estimator using cross-validation
cvScores = cvGBM()

x = np.arange(1, params['n_estimators'] + 1)
cvBestIter = x[np.argmin(cvScores)]
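
The KFold import above comes from sklearn.cross_validation, which later releases removed. A sketch of the same loop against sklearn.model_selection, where folds are produced by .split(); note that heldoutScore still relies on the old gbm.loss_ attribute and min_samples_split=1, so the rest of the notebook still assumes the old API:

In [ ]:
# same CV loop on newer scikit-learn: KFold lives in model_selection and
# yields (train_idx, test_idx) pairs via .split(X) instead of being iterable itself
from sklearn.model_selection import KFold as NewKFold

def cvGBM_new(n_folds=5):
    kf = NewKFold(n_splits=n_folds)
    gbm = GradientBoostingRegressor(**params)
    loss = np.zeros(params['n_estimators'])
    for train_idx, test_idx in kf.split(X_train):
        gbm.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
        loss += heldoutScore(gbm, X_train.iloc[test_idx], y_train.iloc[test_idx])
    return loss / n_folds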

In [29]:
# scratch cell: just checking that the (old-sklearn) transform attribute exists on the regressor
gbm = GradientBoostingRegressor(**params)
gbm.transform

In [93]:
# plot cv deviance
scoresGbmCV = pd.DataFrame({
            'numTrees': pd.Series(x),
            'deviance': pd.Series(cvScores)})
    
ggplot(aes(x='numTrees', y='deviance'), data=scoresGbmCV) + geom_line() + labs(title='GBM CV Deviance')


Out[93]:
<ggplot: (279683206)>

In [95]:
# evaluate on the test set with the CV-selected number of trees
params = {'n_estimators': cvBestIter, 'max_depth': 4, 'min_samples_split': 1,
          'learning_rate': 0.01, 'loss': 'ls'}

gbm = GradientBoostingRegressor(**params)

gbm.fit(X_train, y_train)
rmse = np.sqrt(mean_squared_error(y_test, gbm.predict(X_test)))
print(rmse)

# get stage loss
loss = np.zeros(params['n_estimators'])

for i, pred in enumerate(gbm.staged_predict(X_test)):
    loss[i] = gbm.loss_(y_test, pred)


53.2858846023
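
The staged test loss collected above can also be read off directly for the tree count that minimizes test deviance, mirroring how cvBestIter was taken from the CV curve:

In [ ]:
# tree count with the lowest test deviance (1-indexed, like cvBestIter)
testBestIter = np.argmin(loss) + 1
print(testBestIter, loss[testBestIter - 1])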

In [100]:
# plot test deviance
scoresGBM = pd.DataFrame({
            'numTrees': pd.Series(np.arange(1, params['n_estimators'] + 1)),
            'deviance': pd.Series(loss)})
    
ggplot(aes(x='numTrees', y='deviance'), data=scoresGBM) + geom_line() + labs(title='GBM Test Deviance')


Out[100]:
<ggplot: (279695915)>

In [115]:
# write model to pickle for later use
from sklearn.externals import joblib
joblib.dump(gbm, 'gbm.pkl', compress=9)


Out[115]:
['gbm.pkl']
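
For completeness, a minimal sketch of reloading the persisted model and reusing it; on newer scikit-learn, joblib is imported directly (import joblib) rather than from sklearn.externals:

In [ ]:
# reload the pickled GBM and predict with it
gbm_loaded = joblib.load('gbm.pkl')
preds = gbm_loaded.predict(X_test)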