In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model, preprocessing, grid_search
from sklearn.preprocessing import Imputer, PolynomialFeatures, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from keras.layers import Dense, Activation, Dropout
from keras.models import Sequential
from keras.regularizers import l2, activity_l2
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import train_test_split
from joblib import Parallel, delayed
from sklearn.pipeline import Pipeline
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials 
from hyperas import optim
from hyperas.distributions import choice, uniform, conditional
import category_encoders as ce
from functools import partial
np.random.seed(1338)


Using Theano backend.

In [2]:
def Data_import():
    
    global Data
    #Reading the data, into a Data Frame.
    Data = pd.read_csv('/home/prajwal/Desktop/bank-additional/bank-additional-full.csv',delimiter=';',header=0)

    #Selcting the columns of string data type
    names = Data.select_dtypes(include = ['object'])
    
    #Converting string categorical variables to integer categorical variables.
    label_encode(names.columns.tolist())
    
    global columns
    columns = names.drop(['y'],axis=1).columns.tolist()

Label Encoding


In [3]:
#Function that encodes the string values to numerical values.
def label_encode(column_names):
    
    global Data
    #Encoding the data, encoding the string values into numerical values.
    encoder = ce.OrdinalEncoder(verbose=1, cols=column_names)
    Data = encoder.fit_transform(Data)

One Hot Encoding


In [4]:
#Encoding the data, encoding the string values into numerical values, using one hot encoding method.
def perform_one_hot_encoding():
    
    global Data
    #Finding the one hot encoding for all columns excpet the target label column.
    for column in columns:
    
        #Converting the data frame column to an array
        column_list = np.array(Data[column].tolist())
        #Converting the array to a nested list for the one hot encode transformation
        column_list_of_list = np.reshape(column_list,(-1,1))
    
        #Storing the one hot encode data frame
        data_frame_one_hot = one_hot_encode(column_list_of_list,column)
        
        #Dropping the original column and then replacing it with the one hot encoded data frame.
        Data = Data.drop([column],axis=1)
        Data = pd.concat([Data, data_frame_one_hot], axis = 1)

In [5]:
def one_hot_encode(X,column_name):
    
    enc = OneHotEncoder()
    X = enc.fit_transform(X).toarray()
    #Converting X to a dataframe.
    X = pd.DataFrame(X)
    #Assigning column names to the one hot encoded columns.
    X = X.rename(columns=lambda x: column_name + str(x))
    return X

In [6]:
#def one_hot_encode(column_names):
    
    #global Data
    #Encoding the data, encoding the string values into numerical values, using one-hot method
    #encoder = ce.OneHotEncoder(cols=column_names)
    #Data = encoder.fit_transform(Data)

Binary Encoding


In [7]:
def binary_encode(column_names):
    
    global Data
    #Encoding the data, encoding the string values into numerical values, using binary method.
    encoder = ce.BinaryEncoder(cols=column_names,verbose=1)
    Data = encoder.fit_transform(Data)

Hashing Encoding


In [8]:
def hashing_encode(column_names):
    
    global Data
    #Encoding the data, encoding the string values into numerical values, using hashing method.
    encoder = ce.HashingEncoder(verbose=1, n_components=128, cols=column_names)
    Data = encoder.fit_transform(Data)

Backward Difference Encoder


In [9]:
def backward_difference_encode(column_names):
    
    global Data
    #Encoding the data, encoding the string values into numerical values, using backward difference method.
    encoder = ce.BackwardDifferenceEncoder(verbose=1, cols=column_names)
    Data = encoder.fit_transform(Data)

Helmert Encoding


In [10]:
def helmert_encode(column_names):
    
    global Data
    #Encoding the data, encoding the string values into numerical values, using helmert method.
    encoder = ce.HelmertEncoder(verbose=1, cols=column_names)
    Data = encoder.fit_transform(Data)

Sum Encoding


In [11]:
def sum_encode(column_names):
    
    global Data
    #Encoding the data, encoding the string values into numerical values, using sum method.
    encoder = ce.SumEncoder(verbose=1, cols=column_names)
    Data = encoder.fit_transform(Data)

Polynomial Encode


In [12]:
def polynomial_encode(column_names):
    
    global Data
    #Encoding the data, encoding the string values into numerical values, using polynomial method.
    encoder = ce.PolynomialEncoder(verbose=1, cols=column_names)
    Data = encoder.fit_transform(Data)

Sample Generation


In [13]:
#Performing training, cross validation and testing on different stratified splits of the data.
def sample_generation_one_hot_encode(n):
    
    for i in range(n):
        Data_import()
        perform_one_hot_encoding()
        data_initialize()
        data_split()
        metric_initialize()
        train_cross_val_base_models()
        print_metric_cross_val(i)
        train_second_level_models()
        metric_initialize()
        test_data()
        print_metric_test(i)

In [14]:
#Performing training, cross validation and testing on different stratified splits of the data.
def sample_generation_label_encode(n):
    
    for i in range(n):
        Data_import()
        data_initialize()
        data_split()
        metric_initialize()
        train_cross_val_base_models()
        print_metric_cross_val(i)
        train_second_level_models()
        metric_initialize()
        test_data()
        print_metric_test(i)

In [15]:
#Performing training, cross validation and testing on different stratified splits of the data.
def sample_generation_hashing_encode(n):
    
    for i in range(n):
        Data_import()
        hashing_encode(columns)
        data_initialize()
        data_split()
        metric_initialize()
        train_cross_val_base_models()
        print_metric_cross_val(i)
        train_second_level_models()
        metric_initialize()
        test_data()
        print_metric_test(i)

In [16]:
#Performing training, cross validation and testing on different stratified splits of the data.
def sample_generation_backward_difference_encode(n):
    
    for i in range(n):
        Data_import()
        backward_difference_encode(columns)
        data_initialize()
        data_split()
        metric_initialize()
        train_cross_val_base_models()
        print_metric_cross_val(i)
        train_second_level_models()
        metric_initialize()
        test_data()
        print_metric_test(i)

In [17]:
#Performing training, cross validation and testing on different stratified splits of the data.
def sample_generation_helmert_encode(n):
    
    for i in range(n):
        Data_import()
        helmert_encode(columns)
        data_initialize()
        data_split()
        metric_initialize()
        train_cross_val_base_models()
        print_metric_cross_val(i)
        train_second_level_models()
        metric_initialize()
        test_data()
        print_metric_test(i)

In [18]:
#Performing training, cross validation and testing on different stratified splits of the data.
def sample_generation_binary_encode(n):
    
    for i in range(n):
        Data_import()
        binary_encode(columns)
        data_initialize()
        data_split()
        metric_initialize()
        train_cross_val_base_models()
        print_metric_cross_val(i)
        train_second_level_models()
        metric_initialize()
        test_data()
        print_metric_test(i)

In [19]:
#Performing training, cross validation and testing on different stratified splits of the data.
def sample_generation_sum_encode(n):
    
    for i in range(n):
        Data_import()
        sum_encode(columns)
        data_initialize()
        data_split()
        metric_initialize()
        train_cross_val_base_models()
        print_metric_cross_val(i)
        train_second_level_models()
        metric_initialize()
        test_data()
        print_metric_test(i)

In [20]:
#Performing training, cross validation and testing on different stratified splits of the data.
def sample_generation_polynomial_encode(n):
    
    for i in range(n):
        Data_import()
        polynomial_encode(columns)
        data_initialize()
        data_split()
        metric_initialize()
        train_cross_val_base_models()
        print_metric_cross_val(i)
        train_second_level_models()
        metric_initialize()
        test_data()
        print_metric_test(i)

In [21]:
#Splitting the data into training and testing datasets (Stratified Split)
def data_split():
    

    global Data
    global test
    Data, test = train_test_split(Data, test_size = 0.1, stratify = Data['y'],random_state = 0)

In [22]:
#This function is used to convert the predictions of the base models into a DataFrame.
def build_data_frame(data):
    
    data_frame = pd.DataFrame(data).T
    return data_frame

In [23]:
def data_initialize():
    
    #Initializing the test dataset.
    test = pd.DataFrame()
    
    global stack_X
    global stack_Y
    
    #Initializing two data frames that will be used as training data for the stacked model.
    #The data frame will contain the predictions of the base models.
    stack_X = pd.DataFrame() 
    #The data frame will contain the calss labels of the base models.
    stack_Y = pd.DataFrame() 
    
    global blend_X
    global raw_features_X
    
    #Initializing two data frames that will be used as training data for the blending model.
    #The data frames will contain the predictions and raw features  of the base models.
    blend_X = pd.DataFrame() 
    #The data frames will contain the raw features  of the data, which will be concatenated with the predictions.
    raw_features_X = pd.DataFrame() 
    
    global test_blend_X
    global test_raw_features_X
    global test_stack_X 
    global test_stack_Y
    
    #Initializing the dataframes that will be used for testing the stacking and blending models.
    test_blend_X = pd.DataFrame()
    test_raw_features_X = pd.DataFrame()
    test_stack_X = pd.DataFrame()
    test_stack_Y = pd.DataFrame()

Gradient Boosting (XGBoost)


In [24]:
#Defining the parameters for the XGBoost (Gradient Boosting) Algorithm.
def param_set_XGBoost():
    
    #Gradient Boosting (XGBoost)
    param = {}
    #Setting Parameters for the Booster which will be optimized later using hyperopt.
    param['booster'] = ['gbtree','gblinear']
    param['objective'] = ['binary:logistic']
    param["eval_metric"] = ["auc"]
    param['eta'] = [0.1,0.3,0.5,0.7,0.9,1]
    param['gamma'] = [0,1,5,10,15]
    param['max_depth'] = [3,6,9,12,15,20]
    param['min_child_weight'] = [1,5,10]
    param['max_delta_step'] = [0,1,5,10]
    param['subsample'] = [0.5,1]
    param['colsample_bytree'] = [0.5,1]
    param['silent'] = [1]
    param['seed'] = [0]
    param['base_score'] = [0.5]
    param['lambda_bias'] = [1,5,10]
    param['lambda'] = [0,0.1,0.5,1,10]
    
    return param

In [25]:
#Assigning the values of the XGBoost parameters that need to be checked, for minimizing the objective (loss).
#The values that give the most optimum results will be picked to train the model.
def assign_space_gradient_boosting():
    
    parameter_gradient_boosting = param_set_XGBoost()
    space_gradient_boosting ={
        
        'booster': hp.choice('booster', parameter_gradient_boosting['booster']),
        
        'eta': hp.choice('eta', parameter_gradient_boosting['eta']),
        
        'gamma': hp.choice('gamma', parameter_gradient_boosting['gamma']),
        
        'max_depth': hp.choice('max_depth', parameter_gradient_boosting['max_depth']),
        
        'min_child_weight': hp.choice('min_child_weight', parameter_gradient_boosting['min_child_weight']),
        
        'max_delta_step': hp.choice('max_delta_step', parameter_gradient_boosting['max_delta_step']),
        
        'subsample': hp.choice('subsample', parameter_gradient_boosting['subsample']),
        
        'colsample_bytree': hp.choice('colsample_bytree', parameter_gradient_boosting['colsample_bytree']),
        
        'silent': hp.choice('silent', parameter_gradient_boosting['silent']),
        
        'seed': hp.choice('seed', parameter_gradient_boosting['seed']),
        
        'base_score': hp.choice('base_score', parameter_gradient_boosting['base_score']),
        
        'lambda_bias': hp.choice('lambda_bias', parameter_gradient_boosting['lambda_bias']),
        
        'lambda': hp.choice('lambda', parameter_gradient_boosting['lambda'])
        
    }
    
    return space_gradient_boosting

In [26]:
#This function calculates the loss for different parameter values and is used to determine the most optimum 
#parameter values
def objective_gradient_boosting(space_gradient_boosting,a,b):
    print(a.shape)
    print(space_gradient_boosting)
    #Gradient Boosting (XGBoost)
    param = {}
    #Setting Parameters for the Booster
    param['booster'] = space_gradient_boosting['booster']
    param['objective'] = 'binary:logistic'
    param["eval_metric"] = "auc"
    param['eta'] = space_gradient_boosting['eta']
    param['gamma'] = space_gradient_boosting['gamma']
    param['max_depth'] = space_gradient_boosting['max_depth']
    param['min_child_weight'] = space_gradient_boosting['min_child_weight']
    param['max_delta_step'] = space_gradient_boosting['max_delta_step']
    param['subsample'] = space_gradient_boosting['subsample']
    param['colsample_bytree'] = space_gradient_boosting['colsample_bytree']
    param['silent'] = space_gradient_boosting['silent']
    param['seed'] = space_gradient_boosting['seed']
    param['base_score'] = space_gradient_boosting['base_score']
    param['lambda_bias'] = space_gradient_boosting['lambda_bias']
    param['lambda'] = space_gradient_boosting['lambda']
    
    model = xgb.Booster()
    auc_list = list()
    
    #Declared train_X as a global variable, unable to pass it as a parameter
    #Performing cross validation.
    skf=StratifiedKFold(train_Y, n_folds=3,random_state=0)
    for train_index, cross_val_index in skf:
        
        xgb_train_X, xgb_cross_val_X = train_X.iloc[train_index],train_X.iloc[cross_val_index]
        xgb_train_Y, xgb_cross_val_Y = train_Y.iloc[train_index],train_Y.iloc[cross_val_index]
        
        dtrain = xgb.DMatrix(xgb_train_X, label = xgb_train_Y)
        model = xgb.train(param, dtrain)
        
        predict = model.predict(xgb.DMatrix(xgb_cross_val_X, label = xgb_cross_val_Y))
        auc_list.append(roc_auc_score(xgb_cross_val_Y,predict))
    
    #Calculating the AUC and returning the loss, which will be minimised by selecting the optimum parameters.
    auc = np.mean(auc_list)
    return{'loss':1-auc, 'status': STATUS_OK }

In [27]:
#Using the loss values, this function picks the optimum parameter values. These values will be used 
#for training the model
def gradient_boosting_parameters(train_X,train_Y,obj):
    
    print(train_X.shape)
    space_gradient_boosting = assign_space_gradient_boosting()
    trials = Trials()
    
    best = fmin(fn = partial(obj, a=train_X, b=4),
    space = space_gradient_boosting,
    algo = tpe.suggest,
    max_evals = 100,
    trials = trials)
    
    parameter_gradient_boosting = param_set_XGBoost()
    optimal_param={}
    #Best is a dictionary that contains the indices of the optimal parameter values.
    #The following for loop uses these indices to obtain the parameter values, these values are stored in a
    #dictionary - optimal_param
    for key in best:
        optimal_param[key] = parameter_gradient_boosting[key][best[key]]
        
    #optimal_param['booster'] = 'gbtree'
    optimal_param['objective'] = 'binary:logistic'
    optimal_param["eval_metric"] = "auc"
    
    #Training the model with the optimal parameter values
    dtrain = xgb.DMatrix(train_X, label = train_Y)
    model = xgb.train(optimal_param, dtrain)
    return model

In [28]:
#Trains the Gradient Boosting model.
def train_gradient_boosting(train_X,train_Y):
    
    space_gradient_boosting = assign_space_gradient_boosting()
    model = gradient_boosting_parameters(train_X,train_Y,objective_gradient_boosting)
    return model

In [29]:
def cross_val_gradient_boosting(cross_val_X,cross_val_Y):
    
    predict = gradient_boosting.predict(xgb.DMatrix(cross_val_X, label = cross_val_Y))
    auc = roc_auc_score(cross_val_Y,predict)
    return [auc,predict]

Multi Layer Perceptron


In [30]:
#Trains the Multi Layer Perceptron model.
def train_multi_layer_perceptron(train_X,train_Y):
    
    model = multi_layer_perceptron_parameters(train_X,train_Y,objective_multi_layer_perceptron)
    return model

In [31]:
def cross_val_multi_layer_perceptron(cross_val_X,cross_val_Y):

    global multi_layer_perceptron
    cross_val_X = StandardScaler().fit_transform(cross_val_X)
    
    predict = multi_layer_perceptron.predict(cross_val_X)
    auc = roc_auc_score(cross_val_Y,predict)
    return [auc,predict]

In [32]:
#Defining the parameters for the XGBoost (Gradient Boosting) Algorithm.
def param_set_multi_layer_perceptron():
    
    param={}
    param['dim_layer'] = [32,64]
    param['activation_layer_1'] = ['sigmoid','relu']
    param['init_layer_1'] = ['normal','uniform']
    param['activation_layer_2'] = ['sigmoid','relu']
    param['optimizer'] = ['rmsprop']
    param['dropout'] = [0.2,0.5]
    
    return param

In [33]:
#Assigning the values of the multi layer perceptron parameters that need to be checked, 
#for minimizing the objective (loss). 
#The values that give the most optimum results will be picked to train the model.
def assign_space_multi_layer_perceptron():
    
    parameter_multi_layer_perceptron = param_set_multi_layer_perceptron()
    space_multi_layer_perceptron ={
        
        'dim_layer': hp.choice('dim_layer', parameter_multi_layer_perceptron['dim_layer']),
        
        'activation_layer_1': hp.choice('activation_layer_1', parameter_multi_layer_perceptron['activation_layer_1']),
        
        'init_layer_1': hp.choice('init_layer_1', parameter_multi_layer_perceptron['init_layer_1']),
        
        'activation_layer_2': hp.choice('activation_layer_2', parameter_multi_layer_perceptron['activation_layer_2']),
        
        'optimizer': hp.choice('optimizer', parameter_multi_layer_perceptron['optimizer']),
        
        'dropout': hp.choice('dropout', parameter_multi_layer_perceptron['dropout'])
        
        
    }
    
    return space_multi_layer_perceptron

In [34]:
#This function calculates the loss for different parameter values and is used to determine the most optimum 
#parameter values
def objective_multi_layer_perceptron(space_multi_layer_perceptron):
    
    #Setting Parameters for the MLP model.
    dim_layer = space_multi_layer_perceptron['dim_layer']
    activation_layer_1 = space_multi_layer_perceptron['activation_layer_1']
    init_layer_1 = space_multi_layer_perceptron['init_layer_1']
    activation_layer_2 = space_multi_layer_perceptron['activation_layer_2']
    optimizer = space_multi_layer_perceptron['optimizer']
    dropout = space_multi_layer_perceptron['dropout']

    
    acc_list = list()
    
    #Declared train_X as a global variable, unable to pass it as a parameter
    #Performing cross validation.
    skf=StratifiedKFold(train_Y, n_folds = 3,random_state=0)
    for train_index, cross_val_index in skf:
        
        mlp_train_X, mlp_cross_val_X = train_X.iloc[train_index],train_X.iloc[cross_val_index]
        mlp_train_Y, mlp_cross_val_Y = train_Y.iloc[train_index],train_Y.iloc[cross_val_index]
        #mlp_train_X = mlp_train_X.as_matrix()
        mlp_train_Y = mlp_train_Y.as_matrix()
        mlp_train_X = StandardScaler().fit_transform(mlp_train_X)
        mlp_cross_val_X = StandardScaler().fit_transform(mlp_cross_val_X)

        
        model = Sequential()
        model.add(Dense(output_dim = dim_layer, input_dim = train_X.shape[1], init = init_layer_1
                        , activation = activation_layer_1))
        model.add(Dropout(dropout))
        model.add(Dense(output_dim = 1,activation = activation_layer_2))
        model.compile(optimizer = optimizer,loss = 'binary_crossentropy',metrics = ['accuracy'])
        model.fit(mlp_train_X, mlp_train_Y, nb_epoch = 2, batch_size = 128,verbose=1)

        #predict = model.predict(mlp_cross_val_X)
        #auc_list.append(roc_auc_score(mlp_cross_val_Y,predict))
        
        score = model.evaluate(mlp_cross_val_X, mlp_cross_val_Y, verbose=0)
        acc_list.append(score[1])
    
    #Calculating the AUC and returning the loss, which will be minimised by selecting the optimum parameters.
    acc = np.mean(acc_list)
    return{'loss':1-acc, 'status': STATUS_OK }

In [35]:
#Using the loss values, this function picks the optimum parameter values. These values will be used 
#for training the model
def multi_layer_perceptron_parameters(train_X,train_Y,obj):
    
    space_multi_layer_perceptron = assign_space_multi_layer_perceptron()
    trials = Trials()
    
    best = fmin(fn = obj,
    space = space_multi_layer_perceptron,
    algo = tpe.suggest,
    max_evals = 5,
    trials = trials)
    
    parameter_multi_layer_perceptron = param_set_multi_layer_perceptron()
    optimal_param={}
    
    #Best is a dictionary that contains the indices of the optimal parameter values.
    #The following for loop uses these indices to obtain the parameter values, these values are stored in a
    #dictionary - optimal_param
    for key in best:
        optimal_param[key] = parameter_multi_layer_perceptron[key][best[key]]
    
    train_X = StandardScaler().fit_transform(train_X)
    #Training the model with the optimal parameter values
    model = Sequential()
    model.add(Dense(output_dim = optimal_param['dim_layer'] , 
                    input_dim = train_X.shape[1], init = optimal_param['init_layer_1'], 
                    activation = optimal_param['activation_layer_1']))
    model.add(Dropout(optimal_param['dropout']))
    model.add(Dense(output_dim = 1,
                    activation = optimal_param['activation_layer_2']))
    model.compile(optimizer = optimal_param['optimizer'],loss = 'binary_crossentropy',metrics = ['accuracy'])
    model.fit(train_X, train_Y, nb_epoch = 15, batch_size = 128,verbose=1)
    cross = StandardScaler().fit_transform(cross_val_X)
    x = model.predict(cross)
    
    
    return model

Decision Tree


In [36]:
#Trains the Decision Tree model. Performing a grid search to select the optimal parameter values
def train_decision_tree(train_X,train_Y):
    
    pipeline_model = Pipeline([('dtc', DecisionTreeClassifier())])
    param = {'dtc__max_depth':[6,9,12,15,20],'dtc__criterion':['gini','entropy'],}       
    model_gs = grid_search.GridSearchCV(pipeline_model, param,scoring='roc_auc')
    model_gs.fit(train_X,train_Y)
    return model_gs

In [37]:
def cross_val_decision_tree(cross_val_X,cross_val_Y):
    
    global decision_tree
    predict = decision_tree.predict_proba(cross_val_X)[:,1]
    auc = roc_auc_score(cross_val_Y,predict)
    return [auc,predict]

In [38]:
#def decision_tree_parameters(parameters_decision_tree={}):
    
    #param = parameters_decision_tree
    #return param

Random Forest


In [39]:
#Trains the Random Forest model. Performing a grid search to select the optimal parameter values
def train_random_forest(train_X,train_Y):
    
    pipeline_model = Pipeline([('rfc', RandomForestClassifier())]) 
    param = {'rfc__max_depth':[6,9,12,15,20],'rfc__n_estimators':[5,10,15,20]}
    model_gs = grid_search.GridSearchCV(pipeline_model, param, scoring='roc_auc')
    model_gs.fit(train_X,train_Y)
    return model_gs

In [40]:
def cross_val_random_forest(cross_val_X,cross_val_Y):
    
    global random_forest
    predict = random_forest.predict_proba(cross_val_X)[:,1]
    auc = roc_auc_score(cross_val_Y,predict)
    return [auc,predict]

In [41]:
#def random_forest_parameters(parameters_random_forest={}):
    
    #param = parameters_random_forest
    #return param

Linear Regression


In [42]:
#Trains the Linear Regression model. Performing a grid search to select the optimal parameter values
def train_linear_regression(train_X,train_Y):
    
    pipeline_model = Pipeline([('scl', StandardScaler()),('lr', linear_model.LinearRegression())]) 
    param = {'lr__normalize':[False]}
    #train_X=StandardScaler().fit_transform(train_X)
    model_gs = grid_search.GridSearchCV(pipeline_model, param, scoring='roc_auc')
    model_gs.fit(train_X,train_Y) 
    return model_gs

In [43]:
def cross_val_linear_regression(cross_val_X,cross_val_Y):
    
    #cross_val_X = StandardScaler().fit_transform(cross_val_X)
    predict = linear_regression.predict(cross_val_X)
    auc = roc_auc_score(cross_val_Y,predict)
    return [auc,predict]

In [44]:
#def linear_regression_parameters(parameters_linear_regression={}):
    
    #param = parameters_linear_regression
    #return param

Losgistic Regression (L1)


In [45]:
#Trains the Logistic Regression (L2) model. Performing a grid search to select the optimal parameter values
def train_logistic_regression_L1(train_X,train_Y):

    pipeline_model = Pipeline([('scl', StandardScaler()),('l1', linear_model.LogisticRegression())]) 
    param = {'l1__penalty':['l1'],'l1__C':[0.0001,0.001,0.01,0.1,1,10,100,100]}
    train_X=StandardScaler().fit_transform(train_X)
    model_gs = grid_search.GridSearchCV(pipeline_model, param, scoring='roc_auc')
    model_gs.fit(train_X,train_Y)
    return model_gs

In [46]:
def cross_val_logistic_regression_L1(cross_val_X,cross_val_Y):
    
    cross_val_X = preprocessing.StandardScaler().fit_transform(cross_val_X)
    predict = logistic_regression_L1.predict_proba(cross_val_X)[:,1]
    auc = roc_auc_score(cross_val_Y,predict)
    return [auc,predict]

In [47]:
#def logistic_regression_L1_parameters(parameters_logistic_regression_L1={}):
    
    #param = parameters_logistic_regression_L1
    #return param

Logistic Regression (L2)


In [48]:
#Trains the Logistic Regression (L2) model. Performing a grid search to select the optimal parameter values
def train_logistic_regression_L2(train_X,train_Y):
    
    pipeline_model = Pipeline([('scl', StandardScaler()),('l2', linear_model.LogisticRegression())]) 
    param = {'l2__penalty':['l2'],'l2__C':[0.0001,0.001,0.01,0.1,1,10,100,100]}
    train_X=StandardScaler().fit_transform(train_X)
    model_gs = grid_search.GridSearchCV(pipeline_model, param, scoring='roc_auc')
    model_gs.fit(train_X,train_Y)
    return model_gs

In [49]:
def cross_val_logistic_regression_L2(cross_val_X,cross_val_Y):
    
    cross_val_X = preprocessing.StandardScaler().fit_transform(cross_val_X)
    predict = logistic_regression_L2.predict_proba(cross_val_X)[:,1]
    auc = roc_auc_score(cross_val_Y,predict)
    return [auc,predict]

In [50]:
#def logistic_regression_L2_parameters(parameters_logistic_regression_L2={}):
    
    #param = parameters_logistic_regression_L2
    #return param

Weighted Average


In [51]:
#Perfroms weighted average of the predictions of the base models.
def weighted_average(data_frame_predictions, cross_val_Y):
    
    weighted_avg_predictions=np.average(data_frame_predictions,axis=1,weights=weight)
    auc = roc_auc_score(cross_val_Y,weighted_avg_predictions)
    return [auc,weighted_avg_predictions]

In [52]:
#Defining the objective. Appropriate weights need to be calculated to minimize the loss.
def objective_weighted_average(space):
    
    global stack_X
    weighted_avg_predictions=np.average(stack_X,axis=1,weights=[space['w1'],space['w2'],space['w3']
                                                                              ,space['w4'],space['w5'],
                                                                              space['w6'],space['w7']])
    
    
    global stack_Y
    auc = roc_auc_score(stack_Y,weighted_avg_predictions)
    return{'loss':1-auc, 'status': STATUS_OK }

In [53]:
#Assigning the weights that need to be checked, for minimizing the objective (Loss)
def assign_space_weighted_average():
    
    space ={
        'w1': hp.choice("x_w1", range(10)),
        'w2': hp.choice('x_w2', range(10)),
        'w3': hp.choice('x_w3', range(10)),
        'w4': hp.choice('x_w4', range(10)),
        'w5': hp.choice('x_w5', range(10)),
        'w6': hp.choice('x_w6', range(10)),
        'w7': hp.choice('x_w7', range(10))
    }
    print(space)
    
    return space

In [54]:
#Function that finds the best possible combination of weights for performing the weighted predictions.
def get_weights():
    
    space = assign_space_weighted_average()
    trials = Trials()
    
    best = fmin(fn = objective_weighted_average,
    space = space,
    algo = tpe.suggest,
    max_evals = 100,
    trials = trials)
    best_weights = list()
    
    #Arranging the weights in order of the respective models, and then returning the list of weights.
    for key in sorted(best):
        best_weights.append(best[key])
    
    return best_weights

Stacking


In [55]:
#Trains the Stacking model (Gradient Boosting - XGBoost)
def train_stack_model(train_X,train_Y):
    
    model = gradient_boosting_parameters(train_X,train_Y,objective_stack)
    return model

In [56]:
def cross_val_stack(cross_val_X,cross_val_Y):

    predict = stack.predict(xgb.DMatrix(cross_val_X,label = cross_val_Y))
    auc = roc_auc_score(cross_val_Y,predict)
    return [auc,predict]

In [57]:
#This function calculates the loss for different parameter values and is used to determine the most optimum 
#parameter values
def objective_stack(space_gradient_boosting):
    
    #Gradient Boosting (XGBoost)
    param = {}
    #Setting Parameters for the Booster
    param['booster'] = space_gradient_boosting['booster']
    param['objective'] = 'binary:logistic'
    param["eval_metric"] = "auc"
    param['eta'] = space_gradient_boosting['eta']
    param['gamma'] = space_gradient_boosting['gamma']
    param['max_depth'] = space_gradient_boosting['max_depth']
    param['min_child_weight'] = space_gradient_boosting['min_child_weight']
    param['max_delta_step'] = space_gradient_boosting['max_delta_step']
    param['subsample'] = space_gradient_boosting['subsample']
    param['colsample_bytree'] = space_gradient_boosting['colsample_bytree']
    param['silent'] = space_gradient_boosting['silent']
    param['seed'] = space_gradient_boosting['seed']
    param['base_score'] = space_gradient_boosting['base_score']
    param['lambda_bias'] = space_gradient_boosting['lambda_bias']
    param['lambda'] = space_gradient_boosting['lambda']
    
    model = xgb.Booster()
    auc_list = list()
    
    #Declared train_X as a global variable, unable to pass it as a parameter
    #Performing cross validation.
    skf=StratifiedKFold(stack_Y, n_folds=3,random_state=0)
    for train_index, cross_val_index in skf:
        
        xgb_train_X, xgb_cross_val_X = stack_X.iloc[train_index],stack_X.iloc[cross_val_index]
        xgb_train_Y, xgb_cross_val_Y = stack_Y.iloc[train_index],stack_Y.iloc[cross_val_index]
        
        dtrain = xgb.DMatrix(xgb_train_X, label = xgb_train_Y)
        model = xgb.train(param, dtrain)
        
        predict = model.predict(xgb.DMatrix(xgb_cross_val_X, label = xgb_cross_val_Y))
        auc_list.append(roc_auc_score(xgb_cross_val_Y,predict))
    
    #Calculating the AUC and returning the loss, which will be minimised by selecting the optimum parameters.
    auc = np.mean(auc_list)
    return{'loss':1-auc, 'status': STATUS_OK }

Blending


In [58]:
#Trains the blending model (Gradient Boosting - XGBoost)
def train_blend_model(train_X,train_Y): 
    
    model = gradient_boosting_parameters(train_X,train_Y,objective_blend)
    return model

In [59]:
def cross_val_blend(cross_val_X,cross_val_Y):

    predict = blend.predict(xgb.DMatrix(cross_val_X,label = cross_val_Y))
    auc = roc_auc_score(cross_val_Y,predict)
    return [auc,predict]

In [60]:
#This function calculates the loss for different parameter values and is used to determine the most optimum 
#parameter values
def objective_blend(space_gradient_boosting):
    
    #Gradient Boosting (XGBoost)
    param = {}
    #Setting Parameters for the Booster
    param['booster'] = space_gradient_boosting['booster']
    param['objective'] = 'binary:logistic'
    param["eval_metric"] = "auc"
    param['eta'] = space_gradient_boosting['eta']
    param['gamma'] = space_gradient_boosting['gamma']
    param['max_depth'] = space_gradient_boosting['max_depth']
    param['min_child_weight'] = space_gradient_boosting['min_child_weight']
    param['max_delta_step'] = space_gradient_boosting['max_delta_step']
    param['subsample'] = space_gradient_boosting['subsample']
    param['colsample_bytree'] = space_gradient_boosting['colsample_bytree']
    param['silent'] = space_gradient_boosting['silent']
    param['seed'] = space_gradient_boosting['seed']
    param['base_score'] = space_gradient_boosting['base_score']
    param['lambda_bias'] = space_gradient_boosting['lambda_bias']
    param['lambda'] = space_gradient_boosting['lambda']
    
    model = xgb.Booster()
    auc_list = list()
    
    #Declared train_X as a global variable, unable to pass it as a parameter
    #Performing cross validation.
    skf=StratifiedKFold(stack_Y, n_folds=3,random_state=0)
    for train_index, cross_val_index in skf:
        
        xgb_train_X, xgb_cross_val_X = blend_X.iloc[train_index],blend_X.iloc[cross_val_index]
        xgb_train_Y, xgb_cross_val_Y = stack_Y.iloc[train_index],stack_Y.iloc[cross_val_index]
        
        dtrain = xgb.DMatrix(xgb_train_X, label = xgb_train_Y)
        model = xgb.train(param, dtrain)
        
        predict = model.predict(xgb.DMatrix(xgb_cross_val_X, label = xgb_cross_val_Y))
        auc_list.append(roc_auc_score(xgb_cross_val_Y,predict))
    
    #Calculating the AUC and returning the loss, which will be minimised by selecting the optimum parameters.
    auc = np.mean(auc_list)
    return{'loss':1-auc, 'status': STATUS_OK }

In [61]:
def metric_initialize():
    
    global metric_linear_regression
    global metric_logistic_regression_L2
    global metric_logistic_regression_L1
    global metric_decision_tree
    global metric_random_forest
    global metric_gradient_boosting
    global metric_multi_layer_perceptron
    global metric_stacking
    global metric_blending
    global metric_weighted_average
    
    #Initialzing the variables that will be used to calculate the area under the curve on the given data.
    metric_linear_regression = list()
    metric_logistic_regression_L2 = list()
    metric_logistic_regression_L1 = list()
    metric_decision_tree = list()
    metric_random_forest = list()
    metric_multi_layer_perceptron = list()
    metric_gradient_boosting = list()
    metric_weighted_average = list()
    metric_stacking = list()
    metric_blending = list()

In [62]:
#The list of base model functions (Training).
train_base_model_list = [train_gradient_boosting,train_multi_layer_perceptron,train_decision_tree,train_random_forest,
                 train_linear_regression,train_logistic_regression_L1,train_logistic_regression_L2]

#The list of base model functions (Cross Validation).
cross_val_base_model_list = [cross_val_gradient_boosting,cross_val_multi_layer_perceptron,cross_val_decision_tree
                           ,cross_val_random_forest,cross_val_linear_regression,cross_val_logistic_regression_L1
                           ,cross_val_logistic_regression_L2]

#The list of second level model functions.
cross_val_second_level_model = [cross_val_stack,cross_val_blend,weighted_average]

Base Model Predictions


In [63]:
def train_cross_val_base_models():
    
    #Cross Validation using Stratified K Fold
    train, cross_val = train_test_split(Data, test_size = 0.5, stratify = Data['y'],random_state=0)
    
    #Training the base models, and calculating AUC on the cross validation data.
    #Selecting the data (Traing Data & Cross Validation Data)
    global train_X
    global train_Y
    train_Y = train['y']
    train_X = train.drop(['y'],axis=1)
    
    global cross_val_X
    global cross_val_Y
    cross_val_Y = cross_val['y']
    cross_val_X = cross_val.drop(['y'],axis=1)
    
    global gradient_boosting
    global multi_layer_perceptron
    global decision_tree
    global random_forest
    global linear_regression
    global logistic_regression_L1
    global logistic_regression_L2

    #Training the base models parallely, the resulting models are stored which will be used for cross validation.
    [gradient_boosting,multi_layer_perceptron,decision_tree,random_forest,linear_regression,logistic_regression_L1
     ,logistic_regression_L2] = (Parallel(n_jobs = -1)(delayed(function)(train_X, train_Y)\
                                                   for function in train_base_model_list))
    
    #Computing the AUC and Predictions of all the base models on the cross validation data parallely.
    auc_predict_cross_val = (Parallel(n_jobs = -1)(delayed(function)(cross_val_X,cross_val_Y)\
                                               for function in cross_val_base_model_list))
    
    #Gradient Boosting (XGBoost)
    #The AUC error (Cross Validation Data)
    auc,predict_gradient_boosting = auc_predict_cross_val[0][0],auc_predict_cross_val[0][1]
    metric_gradient_boosting.append(auc)
    
    #Multi Layer Perceptron
    #The AUC (Cross Validation Data)
    predict_mlp=list()
    auc,predict_multi_layer_perceptron = auc_predict_cross_val[1][0],auc_predict_cross_val[1][1]
    metric_multi_layer_perceptron.append(auc)
    
    #predict_multi_layer_perceptron returns a list of lists containing the predictions,
    #this cannot be converted to a dataframe.
    #This inner lists are converted to floats and then used to convert it to a dataframe.
    for i in predict_multi_layer_perceptron:
        predict_mlp.append(float(i))
    
    #Decision Tree)
    #The AUC (Cross Validation Data)
    auc,predict_decision_tree = auc_predict_cross_val[2][0],auc_predict_cross_val[2][1]
    metric_decision_tree.append(auc)
    
    #Random Forest (Deafult=10 Trees)
    #The AUC (Cross Validation Data)
    auc,predict_random_forest = auc_predict_cross_val[3][0],auc_predict_cross_val[3][1]
    metric_random_forest.append(auc)
    
    #Linear Regression
    #The AUC (Cross Validation Data)
    auc,predict_linear_regression = auc_predict_cross_val[4][0],auc_predict_cross_val[4][1]
    metric_linear_regression.append(auc)
    
    #Logistic Regression (Default=l2)
    #The AUC (Cross Validation Data)
    auc,predict_logistic_regression_L1 = auc_predict_cross_val[5][0],auc_predict_cross_val[5][1]
    metric_logistic_regression_L1.append(auc)
    
    #Logistic Regression-L2
    #The AUC (Cross Validation Data)
    auc,predict_logistic_regression_L2 = auc_predict_cross_val[6][0],auc_predict_cross_val[6][1]
    metric_logistic_regression_L2.append(auc)
    
    #Building a list that contains all the predictions of the base models.
    predict_list = [predict_gradient_boosting,predict_decision_tree,predict_random_forest, 
                               predict_linear_regression,predict_logistic_regression_L2,
                               predict_logistic_regression_L1,predict_mlp]
    
    #Converting the above list of predictions into a dataframe, which will be used to train the stacking model.
    global stack_X
    stack_X = stack_X.append(build_data_frame(predict_list))
    
    #Building a list that contains all the raw features, used as cross validation data for the base models.
    global raw_features_X
    raw_features_X = raw_features_X.append(cross_val_X,ignore_index=True)
    
    #Storing the cross validation dataset labels in the variable stack_Y, 
    #which will be used later to train the stacking and blending models.
    global stack_Y
    stack_Y = cross_val_Y

In [64]:
def print_metric_cross_val(n):
    
    #Calculating the average AUC across all the AUC computed on the cross validation folds.
    avg_linear_regression = np.mean(metric_linear_regression)
    avg_logistic_regression_L2 = np.mean(metric_logistic_regression_L2)
    avg_logistic_regression_L1 = np.mean(metric_logistic_regression_L1)
    avg_decision_tree = np.mean(metric_decision_tree)
    avg_random_forest = np.mean(metric_random_forest)
    avg_multi_layer_perceptron = np.mean(metric_multi_layer_perceptron)
    avg_gradient_boosting = np.mean(metric_gradient_boosting)
    
    #Printing the AUC for the base models.
    print('\nStart Cross Validation Sample',n,'\n')
    print (' AUC (Linear Regression)\n',avg_linear_regression)
    print (' AUC (Logistic Regression - L2)\n',avg_logistic_regression_L2)
    print (' AUC (Logistic Regression - L1)\n',avg_logistic_regression_L1)
    print (' AUC (Decision Tree)\n',avg_decision_tree)
    print (' AUC (Random Forest)\n',avg_random_forest)
    print (' AUC (Multi Layer Perceptron)\n',avg_multi_layer_perceptron)
    print (' AUC (Gradient Boosting - XGBoost)\n',avg_gradient_boosting)
    print('\nEnd Cross Validation Sample',n,'\n')

In [65]:
#Running the second level models parallely
def train_second_level_models():
    
    #Performing a weighted average of all the base models and calculating the resulting AUC.
    global weight
    weight = get_weights()
    
    #Converting the above list of predictions and raw features (Concatenate) into a dataframe, 
    #which will be used to train the blending model.
    global blend_X
    blend_X = pd.concat([raw_features_X, stack_X], axis = 1,ignore_index = True)
    
    #Training the Stacking and Blending models parallely using the 
    #predictions of base models on the cross validation data.
    global stack
    global blend
    function_param = [(train_stack_model,stack_X,stack_Y),(train_blend_model,blend_X,stack_Y),
                      (weighted_average,stack_X,stack_Y)]
    [stack,blend,[auc,predict_weighted_average]] = Parallel(n_jobs = -1)(delayed(model_function)(train_X,train_Y)\
                                        for model_function,train_X,train_Y in function_param)
    
    #Calculating and printing the AUC for the weighted average models.
    metric_weighted_average.append(auc)
    print (' AUC (Weighted Average)\n',metric_weighted_average)

In [66]:
def print_metric_test(n):
    
    print('\nStart Test Sample',n,'\n')
    #Printing the AUC for all the models. (Test Data)
    print (' AUC (Linear Regression)\n',metric_linear_regression)
    print (' AUC (Logistic Regression - L2)\n',metric_logistic_regression_L2)
    print (' AUC (Logistic Regression - L1)\n',metric_logistic_regression_L1)
    print (' AUC (Decision Tree)\n',metric_decision_tree)
    print (' AUC (Random Forest)\n',metric_random_forest)
    print (' AUC (Multi Layer Perceptron)\n',metric_multi_layer_perceptron)
    print (' AUC (Weighted Average)\n',metric_weighted_average)
    print (' AUC (Gradient Boosting - XGBoost)\n',metric_gradient_boosting)
    print (' AUC (Stacking)\n',metric_stacking)
    print (' AUC (Blending)\n',metric_blending)
    print('\nEnd Test Sample',n,'\n')

Testing the Base and Second Level Models on the Test Dataset


In [67]:
def test_data():
    
    #Training the base models, and calculating AUC on the test data.
    #Selecting the data (Test Data)
    test_Y = test['y']
    test_X = test.drop(['y'],axis=1)
    
    #Computing the AUC and Predictions of all the base models on the test data parallely.
    auc_predict_test = (Parallel(n_jobs = -1)(delayed(function)(test_X,test_Y)\
                                          for function in cross_val_base_model_list))
    
    #Gradient Boosting (XGBoost)
    #The AUC error (Test Data)
    auc,predict_gradient_boosting = auc_predict_test[0][0],auc_predict_test[0][1]
    metric_gradient_boosting.append(auc)
    
    #Multi Layer Perceptron
    #The AUC (Test Data)
    predict_mlp=list()
    auc,predict_multi_layer_perceptron = auc_predict_test[1][0],auc_predict_test[1][1]
    metric_multi_layer_perceptron.append(auc)
    
    #predict_multi_layer_perceptron returns a list of lists containing the predictions, 
    #this cannot be converted to a dataframe.
    #This inner lists are converted to floats and then used to convert it to a dataframe.
    for i in predict_multi_layer_perceptron:
        predict_mlp.append(float(i))
    
    #Decision Tree)
    #The AUC (Test Data)
    auc,predict_decision_tree = auc_predict_test[2][0],auc_predict_test[2][1]
    metric_decision_tree.append(auc)
    
    #Random Forest (Deafult=10 Trees)
    #The AUC (Test Data)
    auc,predict_random_forest = auc_predict_test[3][0],auc_predict_test[3][1]
    metric_random_forest.append(auc)
    
    #Linear Regression
    #The AUC (Test Data)
    auc,predict_linear_regression = auc_predict_test[4][0],auc_predict_test[4][1]
    metric_linear_regression.append(auc)
    
    #Logistic Regression (Default=l2)
    #The AUC (Test Data)
    auc,predict_logistic_regression_L1 = auc_predict_test[5][0],auc_predict_test[5][1]
    metric_logistic_regression_L1.append(auc)
    
    #Logistic Regression-L2
    #The AUC (Test Data)
    auc,predict_logistic_regression_L2 = auc_predict_test[6][0],auc_predict_test[6][1]
    metric_logistic_regression_L2.append(auc)
    
    #Building a list that contains all the predictions of the base models.
    predict_list = [predict_gradient_boosting,predict_decision_tree,predict_random_forest, 
                               predict_linear_regression,predict_logistic_regression_L2,
                               predict_logistic_regression_L1,predict_mlp]
    global test_stack_X
    global test_raw_features_X
    global test_blend_X
    
    test_stack_X = build_data_frame(predict_list)
    #Converting the list of predictions into a dataframe.
    test_raw_features_X = test_raw_features_X.append(test_X,ignore_index = True)
    #Converting the above list of predictions and raw features (Concatenate) into a dataframe
    test_blend_X = pd.concat([test_raw_features_X, test_stack_X], axis = 1,ignore_index = True)

        
    #Computing the AUC and Predictions of the Stacking and Blending models on the test data parallely.
    auc_predict_test_second_level = Parallel(n_jobs = -1)(delayed(function)(test_X, test_Y) \
                                                      for function,test_X in \
                                                      ((cross_val_second_level_model[0],test_stack_X),\
                                                       (cross_val_second_level_model[1],test_blend_X),\
                                                       (cross_val_second_level_model[2],test_stack_X)))

    #Stacking (XGBoost - Gradient Boosting)
    auc,predict_stack = auc_predict_test_second_level[0][0],auc_predict_test_second_level[0][1]
    metric_stacking.append(auc)    

    #Blending (XGBoost - Gradient Boosting)
    auc,predict_blend = auc_predict_test_second_level[1][0],auc_predict_test_second_level[1][1]
    metric_blending.append(auc)
    
    #Performing a weighted average of all the base models and calculating the resulting AUC.
    auc,predict_weighted_average = auc_predict_test_second_level[2][0],auc_predict_test_second_level[2][1]
    metric_weighted_average.append(auc)

In [68]:
print('ONE HOT ENCODING\n')
%timeit -n1 -r1 sample_generation_one_hot_encode(1)
print('\nEND\n')


ONE HOT ENCODING

(18534, 63)
(18534, 63)
{'lambda': 0.5, 'min_child_weight': 5, 'base_score': 0.5, 'booster': 'gbtree', 'gamma': 0, 'max_delta_step': 1, 'lambda_bias': 5, 'seed': 0, 'subsample': 1, 'max_depth': 3, 'silent': 1, 'colsample_bytree': 1, 'eta': 0.3}
(18534, 63)
{'lambda': 0, 'min_child_weight': 5, 'base_score': 0.5, 'booster': 'gbtree', 'gamma': 10, 'max_delta_step': 5, 'lambda_bias': 5, 'seed': 0, 'subsample': 1, 'max_depth': 12, 'silent': 1, 'colsample_bytree': 1, 'eta': 0.9}
(18534, 63)
{'lambda': 0, 'min_child_weight': 1, 'base_score': 0.5, 'booster': 'gbtree', 'gamma': 1, 'max_delta_step': 5, 'lambda_bias': 10, 'seed': 0, 'subsample': 0.5, 'max_depth': 15, 'silent': 1, 'colsample_bytree': 1, 'eta': 0.3}
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-68-59350e8d362d> in <module>()
      1 print('ONE HOT ENCODING\n')
----> 2 get_ipython().magic('timeit -n1 -r1 sample_generation_one_hot_encode(1)')
      3 print('\nEND\n')

/home/prajwal/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py in magic(self, arg_s)
   2161         magic_name, _, magic_arg_s = arg_s.partition(' ')
   2162         magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2163         return self.run_line_magic(magic_name, magic_arg_s)
   2164 
   2165     #-------------------------------------------------------------------------

/home/prajwal/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py in run_line_magic(self, magic_name, line)
   2082                 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
   2083             with self.builtin_trap:
-> 2084                 result = fn(*args,**kwargs)
   2085             return result
   2086 

<decorator-gen-59> in timeit(self, line, cell)

/home/prajwal/anaconda3/lib/python3.5/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/home/prajwal/anaconda3/lib/python3.5/site-packages/IPython/core/magics/execution.py in timeit(self, line, cell)
   1044                     break
   1045                 number *= 10
-> 1046         all_runs = timer.repeat(repeat, number)
   1047         best = min(all_runs) / number
   1048 

/home/prajwal/anaconda3/lib/python3.5/timeit.py in repeat(self, repeat, number)
    204         r = []
    205         for i in range(repeat):
--> 206             t = self.timeit(number)
    207             r.append(t)
    208         return r

/home/prajwal/anaconda3/lib/python3.5/site-packages/IPython/core/magics/execution.py in timeit(self, number)
    135         gc.disable()
    136         try:
--> 137             timing = self.inner(it, self.timer)
    138         finally:
    139             if gcold:

<magic-timeit> in inner(_it, _timer)

<ipython-input-13-bf78f3ec33f8> in sample_generation_one_hot_encode(n)
      8         data_split()
      9         metric_initialize()
---> 10         train_cross_val_base_models()
     11         print_metric_cross_val(i)
     12         train_second_level_models()

<ipython-input-63-5f12de2c05f0> in train_cross_val_base_models()
     27     [gradient_boosting,multi_layer_perceptron,decision_tree,random_forest,linear_regression,logistic_regression_L1
     28      ,logistic_regression_L2] = (Parallel(n_jobs = -1)(delayed(function)(train_X, train_Y)\
---> 29                                                    for function in train_base_model_list))
     30 
     31     #Computing the AUC and Predictions of all the base models on the cross validation data parallely.

/home/prajwal/anaconda3/lib/python3.5/site-packages/joblib-0.9.4-py3.5.egg/joblib/parallel.py in __call__(self, iterable)
    808                 # consumption.
    809                 self._iterating = False
--> 810             self.retrieve()
    811             # Make sure that we get a last message telling us we are done
    812             elapsed_time = time.time() - self._start_time

/home/prajwal/anaconda3/lib/python3.5/site-packages/joblib-0.9.4-py3.5.egg/joblib/parallel.py in retrieve(self)
    755                     # a working pool as they expect.
    756                     self._initialize_pool()
--> 757                 raise exception
    758 
    759     def __call__(self, iterable):

/home/prajwal/anaconda3/lib/python3.5/site-packages/joblib-0.9.4-py3.5.egg/joblib/parallel.py in retrieve(self)
    725                 job = self._jobs.pop(0)
    726             try:
--> 727                 self._output.extend(job.get())
    728             except tuple(self.exceptions) as exception:
    729                 # Stop dispatching any new job in the async callback thread

/home/prajwal/anaconda3/lib/python3.5/multiprocessing/pool.py in get(self, timeout)
    600 
    601     def get(self, timeout=None):
--> 602         self.wait(timeout)
    603         if not self.ready():
    604             raise TimeoutError

/home/prajwal/anaconda3/lib/python3.5/multiprocessing/pool.py in wait(self, timeout)
    597 
    598     def wait(self, timeout=None):
--> 599         self._event.wait(timeout)
    600 
    601     def get(self, timeout=None):

/home/prajwal/anaconda3/lib/python3.5/threading.py in wait(self, timeout)
    547             signaled = self._flag
    548             if not signaled:
--> 549                 signaled = self._cond.wait(timeout)
    550             return signaled
    551 

/home/prajwal/anaconda3/lib/python3.5/threading.py in wait(self, timeout)
    291         try:    # restore state no matter what (e.g., KeyboardInterrupt)
    292             if timeout is None:
--> 293                 waiter.acquire()
    294                 gotit = True
    295             else:

KeyboardInterrupt: 

In [ ]:
print('LABEL ENCODING\n')
%timeit -n1 -r1 sample_generation_label_encode(1)
print('\nEND\n')

In [ ]:
print('BINARY ENCODING\n')
%timeit -n1 -r1 sample_generation_binary_encode(1)
print('\nEND\n')

In [ ]:
print('HASHING ENCODING\n')
%timeit -n1 -r1 sample_generation_hashing_encode(1)
print('\nEND\n')

In [ ]:
print('BACKWARD DIFFERENCE ENCODING\n')
%timeit -n1 -r1 sample_generation_backward_difference_encode(1)
print('\nEND\n')

In [ ]:
print('HELMERT ENCODING\n')
%timeit -n1 -r1 sample_generation_helmert_encode(1)
print('\nEND\n')

In [ ]:
print('SUM ENCODING\n')
%timeit -n1 -r1 sample_generation_sum_encode(1)
print('\nEND\n')

In [ ]:
print('POLYNOMIAL ENCODING\n')
%timeit -n1 -r1 sample_generation_polynomial_encode(1)
print('\nEND\n')

In [ ]:
#(Parallel(n_jobs=-1)(delayed(sample_generation)(n) for n in range(4)))

In [ ]: