In [1]:
import pandas as pd
import csv as csv
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor #GBM algorithm
# Both CSV files are read with identical options: the first row is the header
# and column index 1 is parsed as dates (presumably 'Open Date', which the
# preprocessing step subtracts from a timestamp -- confirm against the files).
read_options = dict(header=0, parse_dates=[1])
trainData = pd.read_csv('data/train.csv', **read_options)
testData = pd.read_csv('data/test.csv', **read_options)
In [2]:
# Replace 'Open Date' by a feature representing the age of the restaurant in years.
# One reference timestamp is captured and shared by both frames so that train
# and test ages are measured against exactly the same instant.
reference_date = datetime.now()
for frame in (trainData, testData):
    # .dt.days yields whole days; dividing by a float avoids integer floor
    # division on Python 2.
    frame['Open Date'] = (reference_date - frame['Open Date']).dt.days / 365.0

# Replace 'Type', 'City Group' and 'City' by integer indicators.
# Each encoder is fitted on the union of the train and test labels so that a
# given label maps to the same integer in both frames. Fitting one encoder per
# frame (as was done before) assigns codes independently and silently scrambles
# the categorical features at prediction time.
for column in ('Type', 'City Group', 'City'):
    encoder = LabelEncoder().fit(pd.concat([trainData[column], testData[column]]))
    trainData[column] = encoder.transform(trainData[column])
    testData[column] = encoder.transform(testData[column])

# Separate the Y array
Y_train = trainData['revenue']
# Drop the Id and Y variable to create the final X array to be fitted
X_train = trainData.drop(['Id', 'revenue'], axis=1)

# Same for the test data: keep the ids for the submission file, then drop them
ids = testData['Id'].values
testData = testData.drop(['Id'], axis=1)
# Note: this cell overwrites trainData/testData in place, so it cannot be run
# twice in a row. Re-run the loading cell (or restart the kernel) before
# executing it again.
In [3]:
# Creating a RFR with mostly default parameters: 1000 trees trained on all cores.
# max_features=None considers every feature at each split, which is what the
# former 'auto' setting meant for regressors; the split criterion is left at
# the library default (mean squared error). Both spellings used previously
# ('mse', 'auto') are rejected by recent scikit-learn releases.
# A fixed random_state makes the submission reproducible between runs.
forest = RandomForestRegressor(n_estimators=1000, max_features=None,
                               n_jobs=-1, random_state=42)
# Fit the training data
forest = forest.fit(X_train, Y_train)
# Predict the testing data
output = forest.predict(testData)
# Write into submission file. pandas handles opening and closing the file, so
# no handle is leaked on error and the code runs on both Python 2 and 3.
submission = pd.DataFrame({'Id': ids, 'Prediction': output},
                          columns=['Id', 'Prediction'])
submission.to_csv("simpleRFR.csv", index=False)
print('Done.')
In [4]:
# Creating a GBR: 1000 boosting stages with a small learning rate.
# max_depth=None is kept from the original configuration (trees are not
# depth-limited). max_features=None considers every feature at each split,
# which is what the former 'auto' setting meant; 'auto' is rejected by recent
# scikit-learn releases.
# A fixed random_state makes the submission reproducible between runs.
gbr = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01,
                                max_depth=None, max_features=None,
                                min_samples_leaf=1, random_state=42)
# Fit the training data
gbr = gbr.fit(X_train, Y_train)
# Predict the testing data
output = gbr.predict(testData)
# Write into submission file. pandas handles opening and closing the file, so
# no handle is leaked on error and the code runs on both Python 2 and 3.
submission = pd.DataFrame({'Id': ids, 'Prediction': output},
                          columns=['Id', 'Prediction'])
submission.to_csv("simpleGBR.csv", index=False)
print('Done.')
In [ ]: