In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_absolute_error
from random import randint
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, rand

train = pd.read_csv('./data/allstate/train.csv')
test = pd.read_csv('./data/allstate/test.csv')

test['loss'] = np.nan
joined = pd.concat([train, test])

def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    con = 2
    x =preds-labels
    grad =con*x / (np.abs(x)+con)
    hess =con**2 / (np.abs(x)+con)**2
    return grad, hess 

def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels))


cat_feature = [n for n in joined.columns if n.startswith('cat')]    
cont_feature = [n for n in joined.columns if n.startswith('cont')] 

for column in cat_feature:
        joined[column] = pd.factorize(joined[column].values, sort=True)[0]

    
train = joined[joined['loss'].notnull()]
test = joined[joined['loss'].isnull()]
shift = 202
y = np.log(train['loss'] + shift)
ids = test['id']
X = train.drop(['loss', 'id'], 1)
X_test = test.drop(['loss', 'id'], 1)
xgfull = xgb.DMatrix(X, label=y)

def score(params):           
    print("Training with params : ")
    print(params)

    final_fold_prediction= []
    final_fold_real = []

    prediction = np.zeros(X.shape[0])
    
    n_folds  = params["n_folds"]
    del(params["n_folds"])
    
    kf = KFold(X.shape[0], n_folds=n_folds)
    
    for i, (train_index, test_index) in enumerate(kf):        
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]    
          
        xgtrain = xgb.DMatrix(X_train, label=y_train)
        xgtrain_2 = xgb.DMatrix(X_val, label=y_val)
        
        watchlist = [(xgtrain, 'train'), (xgtrain_2, 'eval')]                    

        model = xgb.train(params, xgtrain, 100000, watchlist, obj=logregobj, feval=evalerror, 
        early_stopping_rounds=300, verbose_eval=200)        

        X_val = xgb.DMatrix(X_val) 
        temp_serises = pd.Series(np.exp(model.predict(X_val))-shift)
        final_fold_prediction.append( temp_serises )
        temp_serises = np.exp(y_val) -shift
        final_fold_real.append(temp_serises )
        
        temp_cv_score = mean_absolute_error(np.exp(model.predict(X_val))-shift, np.exp(y_val) -shift)
        print("Fold {} score: {}".format(i + 1, temp_cv_score))
        
        prediction += np.exp(model.predict(xgfull)) - shift

    prediction = prediction/n_folds
    score =  mean_absolute_error(y, prediction)
        
    print("\tMAE {0}\n\n".format(score))    
    return {'loss': score, 'status': STATUS_OK}

def optimize(trials):
    RANDOM_STATE = 2016
    space = {
             'min_child_weight' : hp.quniform('min_child_weight', 0.01, 1.0, 0.01),
             'eta' : hp.quniform('eta', 0.001, 1.0, 0.01),
             'colsample_bytree' : hp.quniform('colsample_bytree', 0.01, 1.0, 0.01),
             'max_depth' : hp.choice('batch_size', np.arange(1, 20, dtype=int)),   
             'subsample' : hp.quniform('subsample', 0.01, 1.0, 0.01),
             'alpha' : hp.quniform('alpha', 0.01, 1.0, 0.01),
             'gamma' : hp.quniform('gamma', 0.01, 1.0, 0.01),                        
             'seed' : RANDOM_STATE,
             'n_folds' : hp.choice('n_folds', np.arange(2, 10, dtype=int)), 
             }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)

    print("Best params predicted by hyperopt are:")    
    print(best)

#Trials object where the history of search will be stored
trials = Trials()

optimize(trials)


/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Training with params : 
{'seed': 2016, 'max_depth': 8, 'eta': 0.08, 'subsample': 0.59, 'gamma': 0.67, 'min_child_weight': 0.1, 'alpha': 0.36, 'n_folds': 9, 'colsample_bytree': 0.64}
[0]	train-mae:3212.42	eval-mae:3228.95
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 300 rounds.
[200]	train-mae:1051.42	eval-mae:1164.25
[400]	train-mae:997.237	eval-mae:1161.89
[600]	train-mae:958.671	eval-mae:1163.49
Stopping. Best iteration:
[368]	train-mae:1003.76	eval-mae:1160.9

Fold 1 score: 1165.2361656730695
[0]	train-mae:3216.95	eval-mae:3192.67
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 300 rounds.
[200]	train-mae:1052.28	eval-mae:1142.67
[400]	train-mae:998.418	eval-mae:1139.47
[600]	train-mae:961.788	eval-mae:1141.11
Stopping. Best iteration:
[471]	train-mae:984.085	eval-mae:1139.07

Fold 2 score: 1143.6871211923396
[0]	train-mae:3214	eval-mae:3215.51
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 300 rounds.
[200]	train-mae:1053.02	eval-mae:1151.09
[400]	train-mae:998.065	eval-mae:1148.19
[600]	train-mae:960.488	eval-mae:1149.17
Stopping. Best iteration:
[434]	train-mae:990.941	eval-mae:1147.87

Fold 3 score: 1150.1260428234489
[0]	train-mae:3212.79	eval-mae:3225.54
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 300 rounds.
[200]	train-mae:1051.6	eval-mae:1157.66
[400]	train-mae:999.266	eval-mae:1157.88
[600]	train-mae:962.318	eval-mae:1159.85
Stopping. Best iteration:
[303]	train-mae:1022.17	eval-mae:1155.37

Fold 4 score: 1159.817922639112
[0]	train-mae:3212.74	eval-mae:3225.77
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 300 rounds.
[200]	train-mae:1051.03	eval-mae:1156.49
[400]	train-mae:998.482	eval-mae:1153.64
[600]	train-mae:961.62	eval-mae:1155.29
Stopping. Best iteration:
[336]	train-mae:1012.91	eval-mae:1153.27

Fold 5 score: 1155.123080531729
[0]	train-mae:3214.47	eval-mae:3211.67
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 300 rounds.
[200]	train-mae:1053.96	eval-mae:1147.14
[400]	train-mae:1002.61	eval-mae:1144.94
Stopping. Best iteration:
[267]	train-mae:1034.36	eval-mae:1144.13

Fold 6 score: 1145.846348365546
[0]	train-mae:3212.68	eval-mae:3226.06
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 300 rounds.
[200]	train-mae:1052.08	eval-mae:1156.74
[400]	train-mae:1000.7	eval-mae:1154.64
[600]	train-mae:963.158	eval-mae:1156.2
Stopping. Best iteration:
[358]	train-mae:1009.23	eval-mae:1154.28

Fold 7 score: 1156.895530760482
[0]	train-mae:3215.12	eval-mae:3206.59
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 300 rounds.
[200]	train-mae:1053.29	eval-mae:1142.98
[400]	train-mae:1000.94	eval-mae:1140.05
[600]	train-mae:964.684	eval-mae:1141.67
Stopping. Best iteration:
[393]	train-mae:1002.12	eval-mae:1139.89

Fold 8 score: 1142.9182317003858
[0]	train-mae:3216.52	eval-mae:3194.98
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 300 rounds.
[200]	train-mae:1053.84	eval-mae:1143.62
[400]	train-mae:1001.81	eval-mae:1142.53
Stopping. Best iteration:
[258]	train-mae:1034.03	eval-mae:1141.27

Fold 9 score: 1144.2843226489645
	MAE 2726.240789107236


Training with params : 
{'seed': 2016, 'max_depth': 15, 'eta': 0.22, 'subsample': 0.16, 'gamma': 0.56, 'min_child_weight': 0.23, 'alpha': 0.4, 'n_folds': 6, 'colsample_bytree': 0.52}
[0]	train-mae:1786.57	eval-mae:1780.52
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 300 rounds.
[200]	train-mae:960.431	eval-mae:1464.45
Stopping. Best iteration:
[16]	train-mae:1125.8	eval-mae:1267.16

Fold 1 score: 1542.9197823746165
[0]	train-mae:1796.72	eval-mae:1789.2
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 300 rounds.
[200]	train-mae:964.118	eval-mae:1460.95
Stopping. Best iteration:
[13]	train-mae:1146.56	eval-mae:1260.43

Fold 2 score: 1527.080681284188
[0]	train-mae:1774.12	eval-mae:1776.21
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 300 rounds.
[200]	train-mae:959.682	eval-mae:1466.85
Stopping. Best iteration:
[17]	train-mae:1132.55	eval-mae:1267.78

Fold 3 score: 1554.7367365207283
[0]	train-mae:1802.09	eval-mae:1801.95
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 300 rounds.
[200]	train-mae:957.827	eval-mae:1461.53
Stopping. Best iteration:
[16]	train-mae:1134.29	eval-mae:1264.16

Fold 4 score: 1534.5662843682808
[0]	train-mae:1801.66	eval-mae:1808.54
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 300 rounds.
[200]	train-mae:954.127	eval-mae:1481.84
Stopping. Best iteration:
[19]	train-mae:1121.84	eval-mae:1261.27

Fold 5 score: 1548.0705671234025
[0]	train-mae:1802.5	eval-mae:1807.79
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 300 rounds.
[200]	train-mae:956.29	eval-mae:1458.86
Stopping. Best iteration:
[16]	train-mae:1135.21	eval-mae:1249.86

Fold 6 score: 1531.5831262005174
	MAE 2912.094075891125


Training with params : 
{'seed': 2016, 'max_depth': 10, 'eta': 0.73, 'subsample': 0.51, 'gamma': 0.29, 'min_child_weight': 0.67, 'alpha': 0.15, 'n_folds': 8, 'colsample_bytree': 0.56}
[0]	train-mae:1.5091e+12	eval-mae:1.53563e+12
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 300 rounds.
/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/ipykernel/__main__.py:26: RuntimeWarning: overflow encountered in exp
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-1-9ad4a4c5a1d9> in <module>()
    108 trials = Trials()
    109 
--> 110 optimize(trials)

<ipython-input-1-9ad4a4c5a1d9> in optimize(trials)
    100              }
    101 
--> 102     best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)
    103 
    104     print("Best params predicted by hyperopt are:")

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/hyperopt/fmin.py in fmin(fn, space, algo, max_evals, trials, rstate, allow_trials_fmin, pass_expr_memo_ctrl, catch_eval_exceptions, verbose, return_argmin)
    304             verbose=verbose,
    305             catch_eval_exceptions=catch_eval_exceptions,
--> 306             return_argmin=return_argmin,
    307             )
    308 

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/hyperopt/base.py in fmin(self, fn, space, algo, max_evals, rstate, verbose, pass_expr_memo_ctrl, catch_eval_exceptions, return_argmin)
    631             pass_expr_memo_ctrl=pass_expr_memo_ctrl,
    632             catch_eval_exceptions=catch_eval_exceptions,
--> 633             return_argmin=return_argmin)
    634 
    635 

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/hyperopt/fmin.py in fmin(fn, space, algo, max_evals, trials, rstate, allow_trials_fmin, pass_expr_memo_ctrl, catch_eval_exceptions, verbose, return_argmin)
    317                     verbose=verbose)
    318     rval.catch_eval_exceptions = catch_eval_exceptions
--> 319     rval.exhaust()
    320     if return_argmin:
    321         return trials.argmin

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/hyperopt/fmin.py in exhaust(self)
    196     def exhaust(self):
    197         n_done = len(self.trials)
--> 198         self.run(self.max_evals - n_done, block_until_done=self.async)
    199         self.trials.refresh()
    200         return self

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/hyperopt/fmin.py in run(self, N, block_until_done)
    170             else:
    171                 # -- loop over trials and do the jobs directly
--> 172                 self.serial_evaluate()
    173 
    174             if stopped:

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/hyperopt/fmin.py in serial_evaluate(self, N)
     87                 ctrl = base.Ctrl(self.trials, current_trial=trial)
     88                 try:
---> 89                     result = self.domain.evaluate(spec, ctrl)
     90                 except Exception as e:
     91                     logger.info('job exception: %s' % str(e))

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/hyperopt/base.py in evaluate(self, config, ctrl, attach_attachments)
    836                 memo=memo,
    837                 print_node_on_error=self.rec_eval_print_node_on_error)
--> 838             rval = self.fn(pyll_rval)
    839 
    840         if isinstance(rval, (float, int, np.number)):

<ipython-input-1-9ad4a4c5a1d9> in score(params)
     67 
     68         model = xgb.train(params, xgtrain, 100000, watchlist, obj=logregobj, feval=evalerror, 
---> 69         early_stopping_rounds=300, verbose_eval=200)        
     70 
     71         X_val = xgb.DMatrix(X_val)

/home/arvc/xgboost/python-package/xgboost/training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, learning_rates, xgb_model, callbacks)
    201                            evals=evals,
    202                            obj=obj, feval=feval,
--> 203                            xgb_model=xgb_model, callbacks=callbacks)
    204 
    205 

/home/arvc/xgboost/python-package/xgboost/training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
     82         # check evaluation result.
     83         if len(evals) != 0:
---> 84             bst_eval_set = bst.eval_set(evals, i, feval)
     85             if isinstance(bst_eval_set, STRING_TYPES):
     86                 msg = bst_eval_set

/home/arvc/xgboost/python-package/xgboost/core.py in eval_set(self, evals, iteration, feval)
    883             res = '[%d]' % iteration
    884             for dmat, evname in evals:
--> 885                 feval_ret = feval(self.predict(dmat), dmat)
    886                 if isinstance(feval_ret, list):
    887                     for name, val in feval_ret:

<ipython-input-1-9ad4a4c5a1d9> in evalerror(preds, dtrain)
     24 def evalerror(preds, dtrain):
     25     labels = dtrain.get_label()
---> 26     return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels))
     27 
     28 

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/metrics/regression.py in mean_absolute_error(y_true, y_pred, sample_weight, multioutput)
    161     """
    162     y_type, y_true, y_pred, multioutput = _check_reg_targets(
--> 163         y_true, y_pred, multioutput)
    164     output_errors = np.average(np.abs(y_pred - y_true),
    165                                weights=sample_weight, axis=0)

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/metrics/regression.py in _check_reg_targets(y_true, y_pred, multioutput)
     73     """
     74     check_consistent_length(y_true, y_pred)
---> 75     y_true = check_array(y_true, ensure_2d=False)
     76     y_pred = check_array(y_pred, ensure_2d=False)
     77 

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    405                              % (array.ndim, estimator_name))
    406         if force_all_finite:
--> 407             _assert_all_finite(array)
    408 
    409     shape_repr = _shape_repr(array.shape)

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/utils/validation.py in _assert_all_finite(X)
     56             and not np.isfinite(X).all()):
     57         raise ValueError("Input contains NaN, infinity"
---> 58                          " or a value too large for %r." % X.dtype)
     59 
     60 

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [ ]: