In [1]:
import pandas as pd
import ensembles as en
import numpy as np
import xgboost as xgb
import category_encoders as ce
from sklearn import datasets, linear_model, preprocessing, grid_search
from sklearn.preprocessing import Imputer, PolynomialFeatures, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.externals import joblib
from keras.layers import Dense, Activation, Dropout
from keras.models import Sequential
from keras.regularizers import l2, activity_l2
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, log_loss, accuracy_score, \
mean_absolute_error, mean_squared_error, r2_score
from sklearn.cross_validation import train_test_split
from joblib import Parallel, delayed
from sklearn.pipeline import Pipeline
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials 
from hyperas import optim
from hyperas.distributions import choice, uniform, conditional
from functools import partial
np.random.seed(1338)


/home/prajwal/anaconda3/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/home/prajwal/anaconda3/lib/python3.5/site-packages/sklearn/grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)
Using Theano backend.
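
The deprecation warnings above come from scikit-learn 0.18 moving the cross-validation and grid-search utilities into model_selection. On 0.20 or newer, the drop-in equivalents of the imports used here would be:

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV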

In [2]:
#Training the base models

Example 1

A minimal run: import the bank marketing data, set ROC AUC as the evaluation metric, search max_depth and eta for gradient boosting, train gradient boosting and a decision tree as base models, assign default weights, stack with gradient boosting, and score the test split.
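
Setting hyper_parameter_optimisation = True asks the library to try every value listed for a parameter. As a rough illustration of such a search (not the ensembles internals), the param_gb grid in the next cell could be explored directly with hyperopt's TPE; X_train and y_train are assumed to be the encoded features and labels:

from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score

# Candidate values mirror the param_gb grid in the next cell
space = {'max_depth': hp.choice('max_depth', [5, 10, 15]),
         'eta': hp.choice('eta', [0.1, 0.3, 0.5])}

def objective(params):
    clf = xgb.XGBClassifier(max_depth=params['max_depth'],
                            learning_rate=params['eta'])
    auc = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=3).mean()
    return {'loss': -auc, 'status': STATUS_OK}  # TPE minimises, so negate AUC

best = fmin(objective, space, algo=tpe.suggest, max_evals=9, trials=Trials())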


In [3]:
%%time
Data = pd.read_csv('/home/prajwal/Desktop/bank-additional/bank-additional-full.csv',delimiter=';',header=0)
data_test = en.data_import(Data, label_output='y')
print('Training Data',Data.shape)
print('Test Data',data_test.shape)

en.metric_set('roc_auc_score')

#Hyper Parameter Optimisation (max_depth and eta)
param_gb = en.parameter_set_gradient_boosting(hyper_parameter_optimisation = True, \
                                                eval_metric = ['auc'], objective = ['binary:logistic'], \
                                              max_depth = [5, 10, 15], eta = [0.1, 0.3, 0.5])

#Setting max_depth, rest are default values
param_dt = en.parameter_set_decision_tree(max_depth = [6])

en.train_base_models(['gradient_boosting','decision_tree'],[param_gb, param_dt], save_models = True)

weights = en.assign_weights(weights = 'default', hyper_parameter_optimisation = True)

#Stacking
en.train_ensemble_models(stack_model_list = ['gradient_boosting'], stack_parameters_list = [param_gb], 
                      perform_weighted_average = True, weights_list = weights)

en.test_models(data_test)


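The stacking call above trains a second-level gradient boosting model on the base models' predictions. Stripped of the ensembles API, the technique looks roughly like this (a sketch; X and y are assumed to be the encoded feature matrix and labels as numpy arrays):

from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

base_models = [DecisionTreeClassifier(max_depth=6),
               RandomForestClassifier(n_estimators=30)]
meta_features = np.zeros((len(X), len(base_models)))

# Out-of-fold predictions keep the meta-features honest: every row is
# predicted by a model that never saw it during training.
kf = KFold(n_splits=5, shuffle=True, random_state=1338)
for j, model in enumerate(base_models):
    for train_idx, val_idx in kf.split(X):
        model.fit(X[train_idx], y[train_idx])
        meta_features[val_idx, j] = model.predict_proba(X[val_idx])[:, 1]

# The second-level (stacker) model learns how to combine the base predictions
stacker = LogisticRegression()
stacker.fit(meta_features, y)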

Example 2

This run binary-encodes the categorical features (sketched below) and carves out an unstratified 10% test split at import time, trains two differently parameterised decision trees plus gradient boosting, assigns manual 2:1:3 weights, and stacks gradient boosting over two ensemble layers.
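
Binary encoding writes each categorical level as the bits of its integer index, so a column with k levels needs only about log2(k) output columns instead of the k columns one-hot encoding would create. A sketch of the idea with the already-imported category_encoders (the column selection is illustrative, not the ensembles internals):

cat_cols = Data.select_dtypes(include=['object']).columns.drop('y')
encoder = ce.BinaryEncoder(cols=list(cat_cols))
X_encoded = encoder.fit_transform(Data.drop('y', axis=1))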


In [ ]:
%%time
Data = pd.read_csv('/home/prajwal/Desktop/bank-additional/bank-additional-full.csv',delimiter=';',header=0)
data_test = en.data_import(Data, label_output='y', encode = 'binary', split = True, stratify = False, split_size = 0.1)
print('Training Data',Data.shape)
print('Test Data',data_test.shape)

en.metric_set('roc_auc_score')

#Hyper Parameter Optimisation (gamma and eta)
param_gb = en.parameter_set_gradient_boosting(hyper_parameter_optimisation = True, \
                                                eval_metric = ['auc'], objective = ['binary:logistic'], \
                                                gamma = [0, 1, 3, 5, 7], eta = [0.1, 0.3], \
                                                max_depth = [5, 10, 15])

#Setting max_depth, splitter, presort rest are default values
#Hyper parameter optimisation - max_depth
#Hyper parameter optimisation - splitter
param_dt_1 = en.parameter_set_decision_tree(max_depth = [6, 10, 12, 15], splitter = ['best', 'random'], \
                                          presort = [True])
#Default Values
param_dt_2 = en.parameter_set_decision_tree()

en.train_base_models(['decision_tree','decision_tree', 'gradient_boosting'], \
                     [param_dt_1, param_dt_2, param_gb])

weights = en.assign_weights(weights = [[2],[1],[3]], hyper_parameter_optimisation = False)


en.train_ensemble_models(['gradient_boosting'], [param_gb],
                      ['gradient_boosting'],[param_gb], 
                      perform_weighted_average = True, weights_list = weights)

en.test_models(data_test)
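
The weights [[2],[1],[3]] give the two decision trees and gradient boosting relative weights 2:1:3 in the weighted average. The combination itself is a convex blend of the models' predicted probabilities, roughly (a sketch; p_dt_1, p_dt_2 and p_gb are hypothetical per-model positive-class probability arrays):

preds = np.array([p_dt_1, p_dt_2, p_gb])        # shape (3, n_samples)
w = np.array([2.0, 1.0, 3.0])

# Weighted soft vote: each model's probabilities count in proportion to its weight
blended = np.average(preds, axis=0, weights=w)  # == (w[:, None] * preds).sum(0) / w.sum()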

Example 3

The full workflow: a three-layer ensemble over six base models (random forest, a Keras multi-layer perceptron, gradient boosting, logistic regression, linear regression, and a decision tree), with candidate weights optimised alongside the hyperparameters, followed by a two-model and a three-model stacking layer.


In [ ]:
%%time
Data = pd.read_csv('/home/prajwal/Desktop/bank-additional/bank-additional-full.csv',delimiter=';',header=0)
data_test = en.data_import(Data, label_output='y')
print('Training Data',Data.shape)
print('Test Data',data_test.shape)

en.metric_set('roc_auc_score')

en.set_no_of_layers(3)

#Hyper Parameter Optimisation (max_depth and eta)
param_gb_1 = en.parameter_set_gradient_boosting(hyper_parameter_optimisation = True, \
                                                eval_metric = ['auc'], objective = ['binary:logistic'], \
                                              max_depth = [5, 10, 15], eta = [0.1, 0.3, 0.5])

#Hyper Parameter Optimisation (gamma and eta)
param_gb_2 = en.parameter_set_gradient_boosting(hyper_parameter_optimisation = True, \
                                                eval_metric = ['auc'], objective = ['binary:logistic'], \
                                                gamma = [0, 1, 3, 5, 7], eta = [0.1, 0.3], \
                                                max_depth = [5, 10, 15], colsample_bylevel = [0.1])


#Setting max_depth, rest are default values
param_dt = en.parameter_set_decision_tree(max_depth = [6])

#Setting max_depth, n_estimators, max_features, rest are default values
#Hyper parameter optimisation - max_depth
#Hyper parameter optimisation - n_estimators
param_rf = en.parameter_set_random_forest(max_depth = [6, 10, 12, 15], n_estimators = [10, 20, 30], \
                                          max_features = ['log2'])

#Setting penalty, C, rest are default values
#Hyper parameter optimisation - penalty
#Hyper parameter optimisation - C
param_lor = en.parameter_set_logistic_regression(penalty = ['l1','l2'], C = [1.0, 2.0, 3.0, 5.0, 10.0])

#Setting fit_intercept, rest are default values
param_lr = en.parameter_set_linear_regression(fit_intercept = [False])

#Setting dim_layer, activation, rest are default values
#Hyper parameter optimisation : dim_layer - Layer1 and Layer 2
#Hyper parameter optimisation : activation - Layer1 and Layer 2
param_mlp = en.parameter_set_multi_layer_perceptron(hyper_parameter_optimisation = True, \
                                                    dim_layer = [[32,64,128], [32,64], [1]], \
                                                   activation = [['sigmoid','relu'], \
                                                                 ['sigmoid'], ['sigmoid','relu']], \
                                                   optimizer = 'rmsprop')



en.train_base_models(['random_forest','multi_layer_perceptron', 'gradient_boosting', \
                      'logistic_regression','linear_regression', 'decision_tree'], \
                     [param_rf, param_mlp, param_gb_1, param_lor, param_lr, param_dt])

weights = en.assign_weights(weights = [[1,2,3,4,5,6],[1,2,3,4,5,6],[1,2,3,4,5,6],[1,2,3,4,5,6],[1,2,3,4,5,6],\
                                    [1,2,3,4,5,6]], hyper_parameter_optimisation = True)

#Setting penalty, rest are default values
param_lor_ens = en.parameter_set_logistic_regression(penalty = ['l2'])

#Setting max_depth, splitter, presort rest are default values
#Hyper parameter optimisation - max_depth
#Hyper parameter optimisation - splitter
param_dt = en.parameter_set_decision_tree(max_depth = [6, 10, 12, 15], splitter = ['best', 'random'], \
                                          presort = [True])


#Reusing the gamma/eta search space (param_gb_2) for both stacking layers
en.train_ensemble_models(['gradient_boosting','logistic_regression'], [param_gb_2, param_lor_ens],
                      ['gradient_boosting','decision_tree','logistic_regression'],[param_gb_2, param_dt,\
                                                                                   param_lor_ens], 
                      perform_weighted_average = True, weights_list = weights)

en.test_models(data_test)
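
For reference, one concrete network from the param_mlp search space above, written out in plain Keras (a sketch; n_features and the particular size/activation choices are illustrative):

model = Sequential()
model.add(Dense(32, input_dim=n_features, activation='relu'))  # layer 1: dim from [32, 64, 128]
model.add(Dense(64, activation='sigmoid'))                     # layer 2: dim from [32, 64]
model.add(Dense(1, activation='sigmoid'))                      # output layer
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])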
