In [1]:
import pandas as pd
import ensembles as en
import numpy as np
import xgboost as xgb
import category_encoders as ce
from sklearn import datasets, linear_model, preprocessing, grid_search
from sklearn.preprocessing import Imputer, PolynomialFeatures, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.externals import joblib
from keras.layers import Dense, Activation, Dropout
from keras.models import Sequential
from keras.regularizers import l2, activity_l2
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, log_loss, accuracy_score, \
mean_absolute_error, mean_squared_error, r2_score
from sklearn.cross_validation import train_test_split
from joblib import Parallel, delayed
from sklearn.pipeline import Pipeline
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from hyperas import optim
from hyperas.distributions import choice, uniform, conditional
from functools import partial
np.random.seed(1338)
In [2]:
# Training the base models (three runs with different import/encoding and hyper-parameter settings)
In [3]:
%%time
Data = pd.read_csv('/home/prajwal/Desktop/bank-additional/bank-additional-full.csv',delimiter=';',header=0)
data_test = en.data_import(Data, label_output='y')
print('Training Data',Data.shape)
print('Test Data',data_test.shape)
en.metric_set('roc_auc_score')
#Hyper Parameter Optimisation (max_depth and eta)
param_gb = en.parameter_set_gradient_boosting(hyper_parameter_optimisation = True, \
eval_metric = ['auc'], objective = ['binary:logistic'], \
max_depth = [5, 10, 15], eta = [0.1, 0.3, 0.5])
#Setting max_depth, rest are default values
param_dt = en.parameter_set_decision_tree(max_depth = [6])
en.train_base_models(['gradient_boosting','decision_tree'],[param_gb, param_dt], save_models = True)
weights = en.assign_weights(weights = 'default', hyper_parameter_optimisation = True)
#Stacking
en.train_ensemble_models(stack_model_list = ['gradient_boosting'], stack_parameters_list = [param_gb],
perform_weighted_average = True, weights_list = weights)
test_models(data_test)
In [ ]:
%%time
Data = pd.read_csv('/home/prajwal/Desktop/bank-additional/bank-additional-full.csv',delimiter=';',header=0)
data_test = en.data_import(Data, label_output='y', encode ='binary', split = True, stratify = False, split_size = 0.1)
print('Training Data',Data.shape)
print('Test Data',data_test.shape)
en.metric_set('roc_auc_score')
#Hyper Parameter Optimisation (gamma and eta)
param_gb = en.parameter_set_gradient_boosting(hyper_parameter_optimisation = True, \
eval_metric = ['auc'], objective = ['binary:logistic'], \
gamma = [0, 1, 3, 5, 7], eta = [0.1, 0.3], \
max_depth = [5, 10, 15])
#Setting max_depth, splitter, presort rest are default values
#Hyper parameter optimisation - max_depth
#Hyper parameter optimisation - splitter
param_dt_1 = en.parameter_set_decision_tree(max_depth = [6, 10, 12, 15], splitter = ['best', 'random'], \
presort = [True])
#Default Values
param_dt_2 = en.parameter_set_decision_tree()
en.train_base_models(['decision_tree','decision_tree', 'gradient_boosting'], \
[param_dt_1, param_dt_2, param_gb])
weights = en.assign_weights(weights = [[2],[1],[3]], hyper_parameter_optimisation = False)
en.train_ensemble_models(['gradient_boosting'], [param_gb],
['gradient_boosting'],[param_gb],
perform_weighted_average = True, weights_list = weights)
en.test_models(data_test)
In [ ]:
%%time
Data = pd.read_csv('/home/prajwal/Desktop/bank-additional/bank-additional-full.csv',delimiter=';',header=0)
data_test = en.data_import(Data, label_output='y')
print('Training Data',Data.shape)
print('Test Data',data_test.shape)
en.metric_set('roc_auc_score')
en.set_no_of_layers(3)
#Hyper Parameter Optimisation (max_depth and eta)
param_gb_1 = en.parameter_set_gradient_boosting(hyper_parameter_optimisation = True, \
eval_metric = ['auc'], objective = ['binary:logistic'], \
max_depth = [5, 10, 15], eta = [0.1, 0.3, 0.5])
#Hyper Parameter Optimisation (gamma and eta)
param_gb_2 = en.parameter_set_gradient_boosting(hyper_parameter_optimisation = True, \
eval_metric = ['auc'], objective = ['binary:logistic'], \
gamma = [0, 1, 3, 5, 7], eta = [0.1, 0.3], \
max_depth = [5, 10, 15], colsample_bylevel = [0.1])
#Setting max_depth, rest are default values
param_dt = en.parameter_set_decision_tree(max_depth = [6])
#Setting max_depth, n_estimators, max_features, rest are default values
#Hyper parameter optimisation - max_depth
#Hyper parameter optimisation - n_estimators
param_rf = en.parameter_set_random_forest(max_depth = [6, 10, 12, 15], n_estimators = [10, 20, 30], \
max_features = ['log2'])
#Setting penalty, C, rest are default values
#Hyper parameter optimisation - penalty
#Hyper parameter optimisation - C
param_lor = en.parameter_set_logistic_regression(penalty = ['l1','l2'], C = [1.0, 2.0, 3.0, 5.0, 10.0])
#Setting fit_intercept, rest are default values
param_lr = en.parameter_set_linear_regression(fit_intercept = [False])
#Setting dim_layer, activation, rest are default values
#Hyper parameter optimisation : dim_layer - Layer1 and Layer 2
#Hyper parameter optimisation : activation - Layer1 and Layer 2
param_mlp = en.parameter_set_multi_layer_perceptron(hyper_parameter_optimisation = True, \
dim_layer = [[32,64,128], [32,64], [1]], \
activation = [['sigmoid','relu'], \
['sigmoid'], ['sigmoid','relu']], \
optimizer = 'rmsprop')
en.train_base_models(['random_forest','multi_layer_perceptron', 'gradient_boosting', \
'logistic_regression','linear_regression', 'decision_tree'], \
[param_rf, param_mlp, param_gb_1, param_lor, param_lr, param_dt])
weights = en.assign_weights(weights = [[1,2,3,4,5,6],[1,2,3,4,5,6],[1,2,3,4,5,6],[1,2,3,4,5,6],[1,2,3,4,5,6],\
[1,2,3,4,5,6]], hyper_parameter_optimisation = True)
#Setting penalty, rest are default values
param_lor_ens = en.parameter_set_logistic_regression(penalty = ['l2'])
#Setting max_depth, splitter, presort rest are default values
#Hyper parameter optimisation - max_depth
#Hyper parameter optimisation - splitter
param_dt = en.parameter_set_decision_tree(max_depth = [6, 10, 12, 15], splitter = ['best', 'random'], \
presort = [True])
en.train_ensemble_models(['gradient_boosting','logistic_regression'], [param_gb,param_lor_ens],
['gradient_boosting','decision_tree','logistic_regression'],[param_gb,param_dt,\
param_lor_ens],
perform_weighted_average = True, weights_list = weights)
en.test_models(data_test)
In [ ]: