In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import time
import os
from random import randint
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error
from utils import get_allstate_train_valid_test_testids, to_xy


shift = 200

def custom_objective_fn(preds, dtrain):
    """Fair-loss objective for xgb.train: returns the per-row gradient and hessian."""
    labels = dtrain.get_label()
    con = 20  # Fair-loss constant; controls how quickly the loss flattens toward MAE
    
    #x = (np.exp(preds)-shift) - (np.exp(labels)-shift)
    x = preds - labels  # residual in log(loss + shift) space
    
    gradient = con*x / (np.abs(x) + con)
    hess = con**2 / (np.abs(x) + con)**2
    
    return gradient, hess
    

def evalerror(preds, dtrain):
    """Custom eval metric: MAE on the original loss scale (undoes the log(loss + shift) transform)."""
    labels = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(preds)-shift, np.exp(labels)-shift)

train, valid, test, testids = get_allstate_train_valid_test_testids(0.15, shift, True)
x_train, y_train = to_xy(train, "loss")
x_valid, y_valid = to_xy(valid, "loss")
x_test, y_test = to_xy(test, "loss")


Train shape is: (188318, 132)
Test shape is: (125546, 131)
/home/arvc/t81_558_deep_learning/utils.py:140: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  train.drop("type", axis=1, inplace=True)
/home/arvc/t81_558_deep_learning/utils.py:141: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  test.drop("type", axis=1, inplace=True)
Final Train shape is: (160070, 1191)
Final Valid shape is: (28248, 1191)
Final Test shape is: (125546, 1191)
float64
float64
float64
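
The custom objective above implements the Fair loss, L(x) = c^2 * (|x|/c - log(1 + |x|/c)) with c = 20, applied to residuals in log(loss + shift) space; its first and second derivatives are exactly the gradient and hess returned by custom_objective_fn. A minimal standalone sketch (not part of the pipeline) that checks those analytic forms against finite differences:

import numpy as np

con = 20.0

def fair_loss(x, c=con):
    # Fair loss: c^2 * (|x|/c - log(1 + |x|/c))
    return c**2 * (np.abs(x) / c - np.log1p(np.abs(x) / c))

def fair_grad(x, c=con):
    return c * x / (np.abs(x) + c)

def fair_hess(x, c=con):
    return c**2 / (np.abs(x) + c)**2

x = np.linspace(-5.0, 5.0, 101)
eps = 1e-5

# Central differences of the loss and of the gradient should agree with the
# analytic gradient and hessian used in custom_objective_fn (up to ~1e-6).
num_grad = (fair_loss(x + eps) - fair_loss(x - eps)) / (2 * eps)
num_hess = (fair_grad(x + eps) - fair_grad(x - eps)) / (2 * eps)

print(np.max(np.abs(num_grad - fair_grad(x))))
print(np.max(np.abs(num_hess - fair_hess(x))))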

In [2]:
x_train.shape


Out[2]:
(160070, 1190)

In [3]:
xgtrain = xgb.DMatrix(x_train, label=y_train)
xgvalid = xgb.DMatrix(x_valid, label=y_valid)
xgtest = xgb.DMatrix(x_test)

#best params on 11/1 for 85% train data: {'subsample': 1.0, 'n_estimators': 174.0, 'eta': 0.1, 
#'colsample_bytree': 0.4, 'gamma': 0.2, 'min_child_weight': 1.0, 'max_depth': 3}

RANDOM_STATE = randint(1,429496)
params = {
    'min_child_weight': 1,
    'eta': 0.01,
    'colsample_bytree': 0.5,
    'max_depth': 12,
    'subsample': 0.8,
    'alpha': 1,
    'gamma': 1,
    'silent': 1,
    'verbose_eval': False,  # has no effect as a booster param; verbose_eval is passed to xgb.train below
    'seed': RANDOM_STATE,
    'eval_metric': 'mae',
    'objective': 'reg:linear',  # placeholder; the custom Fair objective passed to xgb.train overrides this
}

#model = xgb.train(params, xgtrain, int(2012 / 0.9), feval=evalerror)
#model = xgb.train(params, xgtrain, 174, feval=evalerror)
evallist  = [(xgvalid, 'eval')]
model = xgb.train(params, xgtrain, 2240, evals=evallist, early_stopping_rounds=50, verbose_eval=False, 
                  feval=evalerror, obj=custom_objective_fn)

#model = xgb.cv(params, xgtrain, 2240, early_stopping_rounds=50, verbose_eval=False, 
#                  feval=evalerror, obj=custom_objective_fn)

train_predictions = model.predict(xgtrain)
valid_predictions = model.predict(xgvalid)
mae_train = mean_absolute_error(np.exp(y_train) - shift, np.exp(train_predictions) - shift)
mae_valid = mean_absolute_error(np.exp(y_valid) - shift, np.exp(valid_predictions) - shift)
print("MAE score on training data = {}".format(mae_train))
print("MAE score on validation data = {}".format(mae_valid))

def make_submission(selected_model):
    prediction = np.exp(selected_model.predict(xgtest)) - shift
    submission = pd.DataFrame()    
    submission['id'] = testids
    submission['loss'] = prediction
    timestr = time.strftime("%Y%m%d-%H%M%S")
    submission.to_csv("./data/allstate/sub_xgboost_{}.csv".format(timestr), index=False)
    
#MAE score on training data = 972.9862923525721
#MAE score on validation data = 1138.1939778279811


MAE score on training data = 989.2601928710938
MAE score on validation data = 1129.317138671875
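
Because xgb.train was given early_stopping_rounds, training can run up to 50 rounds past the best validation score, and the predictions above use all trees. As a hedged sketch (assuming the older xgboost API used in this notebook, where an early-stopping run sets best_ntree_limit on the Booster), the validation MAE can also be computed at the best iteration only:

best_rounds = getattr(model, "best_ntree_limit", None)
if best_rounds is not None:
    valid_best = model.predict(xgvalid, ntree_limit=best_rounds)
    print("MAE at best iteration = {}".format(
        mean_absolute_error(np.exp(y_valid) - shift, np.exp(valid_best) - shift)))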

In [4]:
model


Out[4]:
<xgboost.core.Booster at 0x7f8505b3ffd0>
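
The trained Booster can be inspected directly; get_fscore() returns, for each feature, how many times it was used in a split, which gives a rough importance ranking (features are named f0, f1, ... here because the DMatrix was built from a NumPy array). A minimal sketch:

importance = model.get_fscore()
top = sorted(importance.items(), key=lambda kv: kv[1], reverse=True)[:20]
for feat, count in top:
    print(feat, count)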

In [5]:
print(model.model_performance(train).mse())


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-5-7c5856eaac63> in <module>()
----> 1 print(model.model_performance(train).mse())

AttributeError: 'Booster' object has no attribute 'model_performance'
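
model_performance is an H2O estimator method, not part of the xgboost Booster API, which is why the call above raises AttributeError. For a Booster, the equivalent check is either the sklearn metric already computed in In [3] or Booster.eval on a DMatrix; a sketch, reusing the matrices defined above (note the reported MAE is in log(loss + shift) space, not dollars):

print(model.eval(xgtrain, name='train'))
print(model.eval(xgvalid, name='valid'))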

In [6]:
make_submission(model)

In [ ]: