Slightly more advanced notebook that fits the restaurant revenue data set with a Random Forest Regressor (RFR) and a Gradient Boosting Regressor, using a grid search to find the optimal parameters

Import libraries and prepare the data


In [1]:
## Similar to Regressors_simple...

import pandas as pd
import numpy as np
import csv as csv
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder  
import scipy as sp
import re
import sklearn
from sklearn.cross_validation import train_test_split,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import matplotlib 
from matplotlib import pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm
from sklearn.ensemble import GradientBoostingRegressor  #GBM algorithm
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search
from sklearn.svm import SVR


%matplotlib inline

# Load the raw train/test tables; column 1 ('Open Date') is parsed as a date.
trainData = pd.read_csv('data/train.csv', header=0, parse_dates=[1])
testData = pd.read_csv('data/test.csv', header=0, parse_dates=[1])

# Replace 'Open Date' by a feature representing the age of the restaurant in years.
# A single reference timestamp is used so both frames are aged against the same instant,
# and `.dt.days` is used because casting a timedelta Series to 'timedelta64[D]' is not
# supported by recent pandas versions.
reference_time = datetime.now()
trainData['Open Date'] = (reference_time - trainData['Open Date']).dt.days / 365.0
testData['Open Date'] = (reference_time - testData['Open Date']).dt.days / 365.0

# Replace 'Type', 'City' and 'City Group' by integer indicators.
# Each encoder is fitted on the union of the train and test values so that a given
# category maps to the SAME integer in both frames; fitting one encoder per frame
# would assign unrelated codes and feed the model inconsistent test features.
for category_column in ['Type', 'City Group', 'City']:
    encoder = LabelEncoder().fit(
        pd.concat([trainData[category_column], testData[category_column]])
    )
    trainData[category_column] = encoder.transform(trainData[category_column])
    testData[category_column] = encoder.transform(testData[category_column])

# Separate the Y array
Y_train = trainData['revenue']
# Drop the Id and Y variable to create the final X array to be fitted
X_train = trainData.drop(['Id', 'revenue'], axis=1)

# Keep the test Ids for the submission files, then drop them from the features
ids = testData['Id'].values
testData = testData.drop(['Id'], axis=1)


/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/sklearn/cross_validation.py:42: DeprecationWarning: This module has been deprecated in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/Users/chelsea/miniconda2/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/sklearn/grid_search.py:43: DeprecationWarning: This module has been deprecated in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)

Grid search the parameter space and fit a Random Forest


In [2]:
# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 1000],
    'max_depth': [1, 2, 4],
    'min_samples_leaf': [1, 3, 5],
    'max_features': [1.0, 0.3, 0.1],
}
# A fixed random_state makes the search (and therefore the selected parameters) reproducible
est = RandomForestRegressor(random_state=42)
gs_cv = GridSearchCV(est, param_grid, n_jobs=-1, cv=10).fit(X_train, Y_train)

# Print the best fit parameters (a bare expression in the middle of a cell is not displayed)
print(gs_cv.best_params_)

# Create a RFR from the parameters selected by the grid search, rather than retyping
# them by hand, so the model always matches the search that was actually run
forest = RandomForestRegressor(random_state=42, **gs_cv.best_params_)
# Fit the training data
forest = forest.fit(X_train, Y_train)
# Predict the testing data
output = forest.predict(testData)

# Write into submission file; the context manager closes the file even if writing fails
with open("interRF.csv", "w") as predictions_file:
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["Id", "Prediction"])
    open_file_object.writerows(zip(ids, output))

Grid search the parameter space and fit a Gradient Boosting regressor


In [3]:
# Define the parameter grid to search, notice the learning_rate parameter
param_grid2 = {
    'n_estimators': [100, 1000],
    'max_depth': [1, 2, 4],
    'learning_rate': [0.1, 0.01],
    'min_samples_leaf': [1, 3, 5],
    'max_features': [1.0, 0.3, 0.1],
}
# A fixed random_state makes the search (and therefore the selected parameters) reproducible
est2 = GradientBoostingRegressor(random_state=42)
gs_cv2 = GridSearchCV(est2, param_grid2, n_jobs=-1, cv=10).fit(X_train, Y_train)

# Print the best fit parameters (a bare expression in the middle of a cell is not displayed)
print(gs_cv2.best_params_)

# Create a GBR from the parameters selected by the grid search, rather than retyping
# them by hand, so the model always matches the search that was actually run
gbr = GradientBoostingRegressor(random_state=42, **gs_cv2.best_params_)
# Fit the training data
gbr = gbr.fit(X_train, Y_train)
# Predict the testing data
output = gbr.predict(testData)

# Write into submission file; the context manager closes the file even if writing fails
with open("interGB.csv", "w") as predictions_file:
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["Id", "Prediction"])
    open_file_object.writerows(zip(ids, output))

In [ ]: