In [1]:
# Standard scientific stack + xgboost; data helpers come from the project-local
# utils module.
import pandas as pd
import numpy as np
import xgboost as xgb
import time
import os
from random import randint
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20; on
# modern versions this import must come from sklearn.model_selection. Also,
# train_test_split is never used in this notebook — presumably a leftover.
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_absolute_error
from utils import get_allstate_train_valid_test_testids, to_xy
# Offset for the log(loss + shift) target transform; undone before scoring.
shift = 200
def custom_objective_fn(preds, dtrain, con=20):
    """Fair-loss objective for xgb.train(obj=...) — a robust alternative to L2.

    Fair loss L(x) = con^2 * (|x|/con - log(1 + |x|/con)) has
    gradient  con * x / (|x| + con)   (bounded by +/- con) and
    hessian   con^2 / (|x| + con)^2   (bounded by 1),
    so large residuals cannot dominate a boosting step.

    Args:
        preds: model predictions, on the log(loss + shift) scale.
        dtrain: xgb.DMatrix carrying the labels.
        con: Fair-loss smoothing constant. Generalized from the hard-coded 20;
            the default preserves the original behavior, and xgboost's
            obj(preds, dtrain) call signature is unaffected.

    Returns:
        Tuple (gradient, hessian) of numpy arrays, one entry per row.
    """
    labels = dtrain.get_label()
    # Residual in the transformed (log) space. An original-scale variant
    # ((exp(preds) - shift) - (exp(labels) - shift)) was tried and abandoned.
    x = preds - labels
    gradient = con * x / (np.abs(x) + con)
    hess = con ** 2 / (np.abs(x) + con) ** 2
    return gradient, hess
def evalerror(preds, dtrain):
    """xgboost feval: MAE reported on the original loss scale.

    Predictions and labels both live on the log(loss + shift) scale, so the
    transform is inverted before scoring. Returns the ('name', value) pair
    that xgb.train(feval=...) expects.
    """
    predicted_loss = np.exp(preds) - shift
    true_loss = np.exp(dtrain.get_label()) - shift
    return 'mae', mean_absolute_error(predicted_loss, true_loss)
# Load the Allstate data: 15% held out for validation, targets transformed to
# log(loss + shift) (the True flag presumably enables that transform — confirm
# in utils). to_xy appears to split the "loss" column off as the target.
train, valid, test, testids = get_allstate_train_valid_test_testids(0.15, shift, True)
x_train, y_train = to_xy(train, "loss")
x_valid, y_valid = to_xy(valid, "loss")
x_test, y_test = to_xy(test, "loss")
In [2]:
# Sanity-check: (n_rows, n_features) of the training design matrix.
x_train.shape
Out[2]:
In [3]:
# Wrap the numpy splits in xgboost's DMatrix container; the test matrix is
# unlabeled (predictions only).
xgtrain = xgb.DMatrix(x_train, label=y_train)
xgvalid = xgb.DMatrix(x_valid, label=y_valid)
xgtest = xgb.DMatrix(x_test)
#best params on 11/1 for 85% train data: {'subsample': 1.0, 'n_estimators': 174.0, 'eta': 0.1,
#'colsample_bytree': 0.4, 'gamma': 0.2, 'min_child_weight': 1.0, 'max_depth': 3}
# NOTE(review): a fresh random seed on every run makes results non-reproducible;
# consider pinning RANDOM_STATE in a config cell.
RANDOM_STATE = randint(1,429496)
params = {
    'min_child_weight': 1,
    'eta': 0.01,  # low learning rate, paired with a large round budget below
    'colsample_bytree': 0.5,
    'max_depth': 12,
    'subsample': 0.8,
    'alpha': 1,  # L1 regularization on leaf weights
    'gamma': 1,  # minimum split-loss reduction
    'silent': 1,
    # NOTE(review): 'verbose_eval' is an xgb.train() argument, not a booster
    # parameter — as a params entry it is most likely ignored (train() below
    # already passes verbose_eval=False explicitly).
    'verbose_eval': False,
    'seed': RANDOM_STATE,
    'eval_metric': 'mae',
    # Overridden at train time by obj=custom_objective_fn / feval=evalerror.
    'objective': 'reg:linear',
}
# Train with early stopping on the held-out validation split. The Fair-loss
# custom objective (obj=...) overrides params['objective'], and evalerror
# reports MAE on the original (un-shifted) loss scale for the stopping rule.
# (Dead commented-out train/cv variants removed.)
evallist = [(xgvalid, 'eval')]
model = xgb.train(params, xgtrain, 2240, evals=evallist, early_stopping_rounds=50, verbose_eval=False,
                  feval=evalerror, obj=custom_objective_fn)

# Score both splits; predictions and labels are log(loss + shift), so invert
# the transform before computing MAE on the original dollar scale.
# NOTE(review): with early stopping, predict() on older xgboost versions uses
# ALL trained trees, not the best iteration — consider passing
# ntree_limit=model.best_ntree_limit; confirm against the installed version.
train_predictions = model.predict(xgtrain)
valid_predictions = model.predict(xgvalid)
mae_train = mean_absolute_error(np.exp(y_train) - shift, np.exp(train_predictions) - shift)
mae_valid = mean_absolute_error(np.exp(y_valid) - shift, np.exp(valid_predictions) - shift)
print("MAE score on training data = {}".format(mae_train))
print("MAE score on validation data = {}".format(mae_valid))
def make_submission(selected_model, out_dir="./data/allstate"):
    """Write a timestamped Kaggle submission CSV for the test set.

    Predictions are inverted from the log(loss + shift) scale back to the
    original dollar scale before writing.

    Args:
        selected_model: trained xgboost Booster (predicts on module-level
            xgtest).
        out_dir: directory the CSV is written to. Generalized from the
            hard-coded path; the default preserves the original location.
    """
    prediction = np.exp(selected_model.predict(xgtest)) - shift
    submission = pd.DataFrame()
    submission['id'] = testids
    submission['loss'] = prediction
    timestr = time.strftime("%Y%m%d-%H%M%S")
    submission.to_csv("{}/sub_xgboost_{}.csv".format(out_dir, timestr), index=False)
#MAE score on training data = 972.9862923525721
#MAE score on validation data = 1138.1939778279811
In [4]:
# Display the trained Booster object (confirms training completed).
model
Out[4]:
In [5]:
# BUG FIX: Booster has no .model_performance() — that is the H2O API, so the
# original line raised AttributeError. Report training MSE (on the
# log(loss + shift) scale) directly from the predictions computed above.
print(np.mean((np.ravel(train_predictions) - np.ravel(y_train)) ** 2))
In [6]:
# Write the timestamped submission CSV under ./data/allstate/.
make_submission(model)
In [ ]: