A very simple notebook that fits the restaurant revenue data set using a Random Forest Regressor (RFR) and a Gradient Boosting Regressor (GBR)

Import libraries and read training and testing data files


In [1]:
import pandas as pd
import csv as csv
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder  
from sklearn.ensemble import GradientBoostingRegressor  #GBM algorithm

# Both CSV files are read with identical options: row 0 is the header and
# the column at position 1 is parsed as a date.
csv_options = dict(header=0, parse_dates=[1])
trainData = pd.read_csv('data/train.csv', **csv_options)
testData = pd.read_csv('data/test.csv', **csv_options)

Convert 'Open Date', 'City', 'City Group', and 'Type' to numerical values


In [2]:
# Turn the raw frames into purely numerical feature matrices:
#   * 'Open Date' is replaced by the age of the restaurant in years;
#   * 'Type', 'City Group' and 'City' are replaced by integer indicators.
CATEGORICAL_COLUMNS = ['Type', 'City Group', 'City']


def encode_features(frame, encoders, reference_date):
    """Return a copy of `frame` with the date and categorical columns made numerical.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'Open Date' (datetime values) and every column
        listed in CATEGORICAL_COLUMNS.
    encoders : dict
        Maps each categorical column name to an already fitted LabelEncoder.
    reference_date : datetime.datetime
        Instant from which the restaurant age is measured.

    Returns
    -------
    pandas.DataFrame
        A new frame; `frame` itself is not modified.
    """
    encoded = frame.copy()
    # Whole days since opening, expressed in years. `.dt.days` is used because
    # recent pandas releases reject `.astype('timedelta64[D]')`.
    encoded['Open Date'] = (reference_date - encoded['Open Date']).dt.days / 365.0
    for column in CATEGORICAL_COLUMNS:
        encoded[column] = encoders[column].transform(encoded[column])
    return encoded


# Fit ONE encoder per column on the values found in both files. Fitting a
# separate encoder on each file would give the same category (e.g. a city)
# different integer codes in train and test, so the model would be applied
# to features that do not mean what they meant during training.
encoders = {}
for column in CATEGORICAL_COLUMNS:
    all_values = pd.concat([trainData[column], testData[column]])
    encoders[column] = LabelEncoder().fit(all_values)

# Capture "now" once so that both frames measure age from the same instant.
reference_date = datetime.now()

trainData = encode_features(trainData, encoders, reference_date)
# Separate the Y array
Y_train = trainData['revenue']
# Drop the Id and Y variable to create the final X array to be fitted
X_train = trainData.drop(['Id', 'revenue'], axis=1)

# Same for Test data
testData = encode_features(testData, encoders, reference_date)
ids = testData['Id'].values
testData = testData.drop(['Id'], axis=1)

# The models are fitted on X_train and applied to testData, so both must
# expose the same features in the same order.
assert list(X_train.columns) == list(testData.columns), "train/test feature columns differ"

# Note: this cell overwrites trainData and testData with their encoded
# versions, so it cannot be run twice in a row. Re-run the data-loading cell
# (or restart the kernel) before re-running it after any modification.

Simple Random Forest Regressor fit and Submission file


In [3]:
# Random Forest Regressor with 1000 trees, trained on all available cores.
# The remaining hyper-parameters are deliberately left at the library defaults
# rather than spelled out: older spellings such as criterion='mse' and
# max_features='auto' are rejected by recent scikit-learn releases.
# random_state is fixed so that re-running the cell reproduces the submission.
forest = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
# Fit the training data
forest.fit(X_train, Y_train)
# Predict the testing data
output = forest.predict(testData)

# Write the submission file. pandas opens and closes the file itself (even if
# writing fails) and behaves the same on Python 2 and Python 3, unlike a
# csv.writer fed a file opened in "wb" mode.
submission = pd.DataFrame({'Id': ids, 'Prediction': output},
                          columns=['Id', 'Prediction'])
submission.to_csv("simpleRFR.csv", index=False)
print('Done.')


Done.

Simple Gradient Boosting Regressor fit and Submission file


In [4]:
# Gradient Boosting Regressor: 1000 boosting stages with a small learning rate.
# max_features is left at its default because the 'auto' spelling is rejected
# by recent scikit-learn releases.
# random_state is fixed so that re-running the cell reproduces the submission.
# NOTE(review): max_depth=None lets every tree grow without a depth limit,
# whereas the library default is 3 -- confirm this is intentional.
gbr = GradientBoostingRegressor(max_depth=None, min_samples_leaf=1,
                                n_estimators=1000, learning_rate=0.01,
                                random_state=0)
# Fit the training data
gbr.fit(X_train, Y_train)
# Predict the testing data
output = gbr.predict(testData)

# Write the submission file. pandas opens and closes the file itself (even if
# writing fails) and behaves the same on Python 2 and Python 3, unlike a
# csv.writer fed a file opened in "wb" mode.
submission = pd.DataFrame({'Id': ids, 'Prediction': output},
                          columns=['Id', 'Prediction'])
submission.to_csv("simpleGBR.csv", index=False)
print('Done.')


Done.

In [ ]: