In [1]:
## Similar to Regressors_simple...
import pandas as pd
import numpy as np
import csv
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import scipy as sp
import re
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import matplotlib
from matplotlib import pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor #GBM algorithm
from sklearn.svm import SVR
%matplotlib inline
trainData = pd.read_csv('data/train.csv', header=0, parse_dates = [1])
testData = pd.read_csv('data/test.csv', header=0, parse_dates = [1])
# Replace 'Open Date' by a feature giving the age of the restaurant in years
trainData['Open Date'] = (datetime.now() - trainData['Open Date']).astype('timedelta64[D]') / 365
testData['Open Date'] = (datetime.now() - testData['Open Date']).astype('timedelta64[D]') / 365
# Replace 'Type', 'City Group' and 'City' by integer codes. Fit each encoder
# on train and test together so both sets share one consistent mapping
# (fitting separately can assign different integers to the same category).
for col in ['Type', 'City Group', 'City']:
    encoder = LabelEncoder().fit(pd.concat([trainData[col], testData[col]]))
    trainData[col] = encoder.transform(trainData[col])
    testData[col] = encoder.transform(testData[col])
# Separate the Y array
Y_train = trainData['revenue']
# Drop the Id and Y columns to create the final X array to be fitted
X_train = trainData.drop(['Id', 'revenue'], axis=1)
# Keep the test ids for the submission files, then drop the column
ids = testData['Id'].values
testData = testData.drop(['Id'], axis=1)
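# Quick sanity check (a sketch, not part of the submission pipeline): hold out
# 20% of the training rows and report hold-out RMSE for an untuned forest, to
# confirm the encoded features are usable before any tuning.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=0)
baseline = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_tr, y_tr)
print(np.sqrt(metrics.mean_squared_error(y_val, baseline.predict(X_val))))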
In [2]:
# Define the parameters grid to search
param_grid = {'n_estimators': [100, 1000],
              'max_depth': [1, 2, 4],
              'min_samples_leaf': [1, 3, 5],
              'max_features': [1.0, 0.3, 0.1]}
est = RandomForestRegressor()
gs_cv = GridSearchCV(est, param_grid, n_jobs=-1, cv=10).fit(X_train, Y_train)
# Print the best-fit parameters
print(gs_cv.best_params_)
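# Optional refinement (a sketch): GridSearchCV scores regressors by R^2 by
# default; if the evaluation metric is RMSE (an assumption about the
# leaderboard here), align the search with it via the negated-MSE scorer.
gs_cv_rmse = GridSearchCV(est, param_grid, n_jobs=-1, cv=10,
                          scoring='neg_mean_squared_error').fit(X_train, Y_train)
print(gs_cv_rmse.best_params_)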
# Create a RandomForestRegressor with the best-fit parameters (entered manually)
forest = RandomForestRegressor(max_depth=4, max_features=0.1, min_samples_leaf=3, n_estimators=100)
# Fit the training data
forest = forest.fit(X_train, Y_train)
# Predict the testing data
output = forest.predict(testData)
# Write the submission file (newline='' avoids blank rows from the csv module on Windows)
with open("interRF.csv", "w", newline="") as predictions_file:
    writer = csv.writer(predictions_file)
    writer.writerow(["Id", "Prediction"])
    writer.writerows(zip(ids, output))
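# Optional inspection (a sketch using the standard feature_importances_
# attribute): rank the encoded features by the fitted forest's importances.
importances = pd.Series(forest.feature_importances_, index=X_train.columns)
print(importances.sort_values(ascending=False).head(10))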
In [3]:
# Define the parameters grid to search, notice the learning_rate parameter
param_grid2 = {'n_estimators': [100, 1000],
               'max_depth': [1, 2, 4],
               'learning_rate': [0.1, 0.01],
               'min_samples_leaf': [1, 3, 5],
               'max_features': [1.0, 0.3, 0.1]}
est2 = GradientBoostingRegressor()
gs_cv2 = GridSearchCV(est2, param_grid2, n_jobs=-1, cv=10).fit(X_train, Y_train)
# Print the best-fit parameters
print(gs_cv2.best_params_)
# Create a GradientBoostingRegressor with the best-fit parameters (entered manually)
gbr = GradientBoostingRegressor(max_depth=4, max_features=0.1, min_samples_leaf=1, n_estimators=100, learning_rate=0.01)
# Fit the training data
gbr = gbr.fit(X_train, Y_train)
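# Optional diagnostic (a sketch using the standard staged_predict API):
# plot training RMSE across boosting stages to judge whether 100 trees at
# learning_rate=0.01 are enough for the loss to level off.
train_rmse = [np.sqrt(metrics.mean_squared_error(Y_train, pred))
              for pred in gbr.staged_predict(X_train)]
plt.plot(train_rmse)
plt.xlabel('boosting iteration')
plt.ylabel('training RMSE')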
# Predict the testing data
output = gbr.predict(testData)
# Write the submission file
with open("interGB.csv", "w", newline="") as predictions_file:
    writer = csv.writer(predictions_file)
    writer.writerow(["Id", "Prediction"])
    writer.writerows(zip(ids, output))
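# Optional blend (an assumption, not something the CV results above justify
# on their own): average the two models' test predictions and write a third
# submission file, here named interBlend.csv for illustration.
blend = (forest.predict(testData) + gbr.predict(testData)) / 2
pd.DataFrame({'Id': ids, 'Prediction': blend}).to_csv('interBlend.csv', index=False)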
In [ ]: