notebook.community

Edit and run



In [20]:

    
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import explained_variance_score,r2_score,mean_absolute_error
from sklearn import preprocessing

import numpy as np
import pandas as pd

from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, rand
import sys
import os
from utils import get_allstate_train_valid_test_testids
import h2o
import os
import time
h2o.init(max_mem_size = "40G")             #specify max number of bytes. uses all cores by default.
h2o.remove_all()  
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator
from h2o import H2OFrame









    



Checking whether there is an H2O instance running at http://localhost:54321. connected.






    




H2O cluster uptime:
3 days 3 hours 17 mins
H2O cluster version:
3.10.0.9
H2O cluster version age:
10 days 
H2O cluster name:
H2O_from_python_arvc_lgnmyd
H2O cluster total nodes:
1
H2O cluster free memory:
35.31 Gb
H2O cluster total cores:
16
H2O cluster allowed cores:
16
H2O cluster status:
locked, healthy
H2O connection url:
http://localhost:54321
H2O connection proxy:
None
Python version:
3.5.2 final



In [21]:

    
#Read Input CSV file
shift = 203

train, valid, test, testids = get_allstate_train_valid_test_testids(0.15, shift, True)
test["loss"] = 0.0

trainframe = H2OFrame(train)
validframe = H2OFrame(valid)
testframe = H2OFrame(test)
del(train)
del(valid)
del(test)
y = "loss"
x = list(trainframe.columns)









    



Train shape is: (188318, 132)
Test shape is: (125546, 131)






    



/home/arvc/t81_558_deep_learning/utils.py:140: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  train.drop("type", axis=1, inplace=True)
/home/arvc/t81_558_deep_learning/utils.py:141: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  test.drop("type", axis=1, inplace=True)






    



Final Train shape is: (160070, 1191)
Final Valid shape is: (28248, 1191)
Final Test shape is: (125546, 1191)
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%



In [22]:

    
min_mae = 10000

def score(params):
    print("Training with params : ")
    print(params)
    
    model = H2OGradientBoostingEstimator(
                 ntrees=params["ntrees"],
                 learn_rate=params["learn_rate"],
                 max_depth=params["max_depth"],
                 sample_rate=params["sample_rate"],
                 col_sample_rate=0.7,
                 stopping_rounds=200,
                 stopping_tolerance=0.01, #10-fold increase in threshold as defined in rf_v1
                 score_each_iteration=True,
                 seed=200000,
                 nfolds=6,                 
                 keep_cross_validation_predictions=True
                )
    
   
    predictions = model.predict(validframe)
    score =  mean_absolute_error(np.exp(validframe["loss"].as_data_frame()) - shift, np.exp(predictions.as_data_frame()) - shift)
    if score < min_mae:
        min_mae = score
        
    print("MAE {}".format(score))
    return {'loss': score, 'status': STATUS_OK}


def optimize(trials):
    space = {
             'ntrees' : hp.choice('ntrees', [100, 250, 500, 750,1000,1250, 1500, 1750, 2000]),
             'learn_rate' : hp.quniform('learn_rate', 0.01, 5, 0.01),
             'max_depth' : hp.choice('max_depth', [100, 250, 500, 750,1000,1250, 1500, 1750, 2000,2250]),            
             'sample_rate' : hp.quniform('sample_rate', 0.01, 1, 0.01),
             'gamma' : hp.quniform('gamma', 0.5, 1, 0.05),
             'col_sample_rate' : hp.quniform('col_sample_rate', 0.01, 1, 0.01),
             }
    
    
    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=1000)
    print("Best params:")
    print(best)
    print("Min MAE={}".format(min_mae))


#Trials object where the history of search will be stored
trials = Trials()

optimize(trials)









    



Training with params : 
{'gamma': 0.65, 'sample_rate': 0.04, 'learn_rate': 4.62, 'max_depth': 100, 'ntrees': 750, 'col_sample_rate': 0.33}






    



---------------------------------------------------------------------------
H2OResponseError                          Traceback (most recent call last)
<ipython-input-22-5266f37f4ab6> in <module>()
     49 trials = Trials()
     50 
---> 51 optimize(trials)

<ipython-input-22-5266f37f4ab6> in optimize(trials)
     40 
     41 
---> 42     best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=1000)
     43     print("Best params:")
     44     print(best)

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/hyperopt/fmin.py in fmin(fn, space, algo, max_evals, trials, rstate, allow_trials_fmin, pass_expr_memo_ctrl, catch_eval_exceptions, verbose, return_argmin)
    304             verbose=verbose,
    305             catch_eval_exceptions=catch_eval_exceptions,
--> 306             return_argmin=return_argmin,
    307             )
    308 

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/hyperopt/base.py in fmin(self, fn, space, algo, max_evals, rstate, verbose, pass_expr_memo_ctrl, catch_eval_exceptions, return_argmin)
    631             pass_expr_memo_ctrl=pass_expr_memo_ctrl,
    632             catch_eval_exceptions=catch_eval_exceptions,
--> 633             return_argmin=return_argmin)
    634 
    635 

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/hyperopt/fmin.py in fmin(fn, space, algo, max_evals, trials, rstate, allow_trials_fmin, pass_expr_memo_ctrl, catch_eval_exceptions, verbose, return_argmin)
    317                     verbose=verbose)
    318     rval.catch_eval_exceptions = catch_eval_exceptions
--> 319     rval.exhaust()
    320     if return_argmin:
    321         return trials.argmin

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/hyperopt/fmin.py in exhaust(self)
    196     def exhaust(self):
    197         n_done = len(self.trials)
--> 198         self.run(self.max_evals - n_done, block_until_done=self.async)
    199         self.trials.refresh()
    200         return self

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/hyperopt/fmin.py in run(self, N, block_until_done)
    170             else:
    171                 # -- loop over trials and do the jobs directly
--> 172                 self.serial_evaluate()
    173 
    174             if stopped:

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/hyperopt/fmin.py in serial_evaluate(self, N)
     87                 ctrl = base.Ctrl(self.trials, current_trial=trial)
     88                 try:
---> 89                     result = self.domain.evaluate(spec, ctrl)
     90                 except Exception as e:
     91                     logger.info('job exception: %s' % str(e))

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/hyperopt/base.py in evaluate(self, config, ctrl, attach_attachments)
    836                 memo=memo,
    837                 print_node_on_error=self.rec_eval_print_node_on_error)
--> 838             rval = self.fn(pyll_rval)
    839 
    840         if isinstance(rval, (float, int, np.number)):

<ipython-input-22-5266f37f4ab6> in score(params)
     20 
     21 
---> 22     predictions = model.predict(validframe)
     23     score =  mean_absolute_error(np.exp(validframe["loss"].as_data_frame()) - shift, np.exp(predictions.as_data_frame()) - shift)
     24     if score < min_mae:

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/h2o/model/model_base.py in predict(self, test_data)
    146         """
    147         if not isinstance(test_data, h2o.H2OFrame): raise ValueError("test_data must be an instance of H2OFrame")
--> 148         j = H2OJob(h2o.api("POST /4/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id)),
    149                    self._model_json['algo'] + " prediction")
    150         j.poll()

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/h2o/h2o.py in api(endpoint, data, json, filename, save_to)
     82     # type checks are performed in H2OConnection class
     83     _check_connection()
---> 84     return h2oconn.request(endpoint, data=data, json=json, filename=filename, save_to=save_to)
     85 
     86 

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/h2o/backend/connection.py in request(self, endpoint, data, json, filename, save_to)
    257                                     auth=self._auth, verify=self._verify_ssl_cert, proxies=self._proxies)
    258             self._log_end_transaction(start_time, resp)
--> 259             return self._process_response(resp, save_to)
    260 
    261         except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError) as e:

/home/arvc/anaconda3/envs/tensorflow/lib/python3.5/site-packages/h2o/backend/connection.py in _process_response(response, save_to)
    584         # Client errors (400 = "Bad Request", 404 = "Not Found", 412 = "Precondition Failed")
    585         if status_code in {400, 404, 412} and isinstance(data, (H2OErrorV3, H2OModelBuilderErrorV3)):
--> 586             raise H2OResponseError(data)
    587 
    588         # Server errors (notably 500 = "Server Error")

H2OResponseError: Server error water.exceptions.H2OKeyNotFoundArgumentException:
  Error: Object 'None' not found in function: predict for argument: model
  Request: POST /4/Predictions/models/None/frames/Key_Frame__upload_8fe6e9bca7ea6ddf6fe559dff4ec4da1.hex



In [ ]:

H2O cluster uptime:	3 days 3 hours 17 mins
H2O cluster version:	3.10.0.9
H2O cluster version age:	10 days
H2O cluster name:	H2O_from_python_arvc_lgnmyd
H2O cluster total nodes:	1
H2O cluster free memory:	35.31 Gb
H2O cluster total cores:	16
H2O cluster allowed cores:	16
H2O cluster status:	locked, healthy
H2O connection url:	http://localhost:54321
H2O connection proxy:	None
Python version:	3.5.2 final