In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold  # sklearn.cross_validation was removed; KFold now lives in model_selection
from sklearn.feature_selection import SelectFromModel  # replaces the removed GradientBoostingRegressor.transform()
In [4]:
data_path = '/devdata/course_proj/aml/CitiBot/data/'
In [5]:
train = pd.read_csv(data_path + 'train.csv').iloc[:, 1:]    # .ix was removed from pandas; drop the index column with .iloc
testSet = pd.read_csv(data_path + 'test.csv').iloc[:, 1:]
# one-hot encode the categorical features for the tree models and concatenate them back onto each dataframe
train = pd.concat([train, train['season'].str.get_dummies(sep=',')], axis=1)
train = pd.concat([train, train['dayOfWeek'].str.get_dummies(sep=',')], axis=1)
testSet = pd.concat([testSet, testSet['season'].str.get_dummies(sep=',')], axis=1)
testSet = pd.concat([testSet, testSet['dayOfWeek'].str.get_dummies(sep=',')], axis=1)
# the current test set contains no 'spring', 'summer', or 'fall' rows, so only 'winter' is usable
cols = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
        'holiday', 'winter',
        'max', 'min', 'rain', 'snow']
X_train = train[cols]
y_train = train.visited
X_test = testSet[cols]
y_test = testSet.visited
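Because 'spring', 'summer', and 'fall' never appear in this test split, str.get_dummies yields fewer columns for testSet than for train. A minimal guard, sketched here rather than taken from the original run, uses pandas' Index.difference to add the missing dummy columns:

# sketch: give the test frame any dummy columns that exist only in train,
# filled with zeros, so both frames share one feature space
for col in train.columns.difference(testSet.columns):
    testSet[col] = 0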
In [6]:
# gradient boosting regressor for feature extraction
def gbm_features(trained_gbm, data_x):
    # the GBM below is fit with the squared-error loss; its old transform()
    # method (importance-based feature selection) was removed from
    # scikit-learn, so SelectFromModel reproduces that behaviour here
    selector = SelectFromModel(trained_gbm, prefit=True)
    return selector.transform(data_x)
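The function above selects columns by importance. A different notion of "GBM features", often used for stacking, is to one-hot encode the leaf index each tree assigns to a sample. A hedged sketch follows; gbm_leaf_features is not part of the original notebook:

from sklearn.preprocessing import OneHotEncoder

def gbm_leaf_features(trained_gbm, data_x):
    # apply() returns, for each sample, the leaf it lands in within every
    # tree, shape (n_samples, n_estimators); one-hot encoding those indices
    # yields a sparse binary feature matrix
    leaves = trained_gbm.apply(data_x)
    return OneHotEncoder(handle_unknown='ignore').fit_transform(leaves)

In practice the encoder should be fit on training-set leaves and reused on the test set so both share one column layout.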
In [14]:
params = {'n_estimators': 200, 'max_depth': 4, 'min_samples_split': 2,  # min_samples_split must be an int >= 2
          'learning_rate': 0.01, 'loss': 'squared_error'}               # 'ls' was renamed to 'squared_error'
gbm = GradientBoostingRegressor(**params)
gbm.fit(X_train.values, y_train.values)
Out[14]:
GradientBoostingRegressor(learning_rate=0.01, max_depth=4, n_estimators=200)
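r2_score and mean_squared_error were imported in In [3] but never called in this excerpt; a quick sanity check of the fitted model might look like this (a sketch, not an original cell):

pred = gbm.predict(X_test.values)
print('test R^2: %.3f' % r2_score(y_test, pred))
print('test MSE: %.3f' % mean_squared_error(y_test, pred))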
In [17]:
X_gbm = gbm_features(gbm, X_test.values)  # select the important test-set features via the fitted GBM
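RandomForestRegressor is imported in In [3] but unused here. One plausible next step, sketched below with assumed hyperparameters (n_estimators=200, random_state=0 are not values from the original notebook), is to refit a forest on the GBM-selected features:

X_train_sel = gbm_features(gbm, X_train.values)  # same selector applied to the training set
rf = RandomForestRegressor(n_estimators=200, random_state=0)
rf.fit(X_train_sel, y_train.values)
rf_pred = rf.predict(X_gbm)                      # X_gbm holds the selected test-set features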
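KFold is likewise imported but unused in this excerpt. A minimal cross-validation sketch over the training set (the fold count and seed are assumptions):

kf = KFold(n_splits=5, shuffle=True, random_state=0)
scores = []
for tr_idx, va_idx in kf.split(X_train.values):
    fold_model = GradientBoostingRegressor(**params)
    fold_model.fit(X_train.values[tr_idx], y_train.values[tr_idx])
    scores.append(r2_score(y_train.values[va_idx],
                           fold_model.predict(X_train.values[va_idx])))
print('mean CV R^2: %.3f' % np.mean(scores))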