In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model, preprocessing
from sklearn.preprocessing import Imputer, PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold, KFold
from keras.layers import Dense, Activation, LSTM
from keras.models import Sequential
from keras.regularizers import l2, activity_l2
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import train_test_split
from joblib import Parallel, delayed


Using Theano backend.

In [2]:
#Reading the data into a DataFrame.
Data = pd.read_csv('/home/prajwal/Desktop/bank-additional/bank-additional-full.csv',delimiter=';',header=0)

#Creating a label encoder to convert the string values into numerical values.
encode = preprocessing.LabelEncoder()

#Selecting the columns of string data type
names = Data.select_dtypes(include=['object'])

#Function that encodes the string values to numerical values.
def enc(data,column):
    data[column] = encode.fit_transform(data[column])
    return data
for column in names:
    Data = enc(Data,column)
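
#Note (a hedged aside): fit_transform re-fits the shared encoder on each column,
#which is why a single LabelEncoder instance can safely be reused across columns.
#An equivalent one-liner per column, not used here, would be:
#Data[column] = preprocessing.LabelEncoder().fit_transform(Data[column])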
        
#Splitting the data into training and testing datasets
Data, test = train_test_split(Data, test_size = 0.05,stratify=Data['y'])

#Initializing two data frames that will be used as training data for the stacked model.
stack_X = pd.DataFrame() #The data frame will contain the predictions of the base models.
stack_Y = pd.DataFrame() #The data frame will contain the class labels of the base models.

#Initializing two data frames that will be used as training data for the blending model.
blend_X = pd.DataFrame() #The data frame will contain the predictions and raw features of the base models.
raw_features_X = pd.DataFrame() #The data frame will contain the raw features of the data, which will be concatenated with the predictions.
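
#Intended layout once the cross-validation loop below has run (a sketch):
#stack_X : one row per training example, one column per base model (predictions)
#stack_Y : the matching true class labels
#blend_X : raw_features_X concatenated column-wise with stack_X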

In [3]:
#This function is used to convert the predictions of the base models into a DataFrame.
#The transpose turns each model's prediction list into a column.
def build_data_frame(data):
    data_frame = pd.DataFrame(data).T
    return data_frame
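
#Quick illustration with hypothetical toy data: two per-model prediction lists
#of length 3 become a 3x2 frame, one column per base model.
example = build_data_frame([[0.1, 0.7, 0.4], [0.2, 0.8, 0.5]])
#example.shape == (3, 2)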

In [4]:
#Defining the parameters for the XGBoost (Gradient Boosting) Algorithm.
def param_set():
    #Gradient Boosting (XGBoost)
    param = {}
    #Setting Parameters for the Booster
    param['booster'] = 'gbtree'
    param['objective'] = 'binary:logistic'
    param["eval_metric"] = "auc"
    param['eta'] = 0.3
    param['gamma'] = 0
    param['max_depth'] = 6
    param['min_child_weight'] = 1
    param['max_delta_step'] = 0
    param['subsample'] = 1
    param['colsample_bytree'] = 1
    param['silent'] = 1
    param['seed'] = 0
    param['base_score'] = 0.5
    param['lambda_bias'] = 1
    return param
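
#Usage sketch (hedged; mirrors the xgb.train calls in the next cell):
#booster = xgb.train(param_set(), xgb.DMatrix(train_X, label=train_Y))
#Since num_boost_round is not passed, xgb.train uses its default of 10 boosting rounds.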

In [5]:
#This function trains the base models, the stacking model, or the blending model, and returns the trained model(s) in a dictionary.
def train_models(train_X,train_Y,model):
    
    param = param_set()
    #Trains only the base models.
    if(model=='base'):
        
        #Gradient Boosting
        dtrain = xgb.DMatrix(train_X,label=train_Y)
        gradient_boosting = xgb.train(param, dtrain)
        
        #Multi Layer Perceptron
        multi_layer_perceptron = Sequential()
        multi_layer_perceptron.add(Dense(output_dim = 64, input_dim = 20, init = 'uniform', activation = 'sigmoid'))
        multi_layer_perceptron.add(Dense(output_dim = 1, input_dim = 64, activation = 'sigmoid'))
        multi_layer_perceptron.compile(optimizer = 'rmsprop',loss = 'binary_crossentropy',metrics = ['accuracy'])
        multi_layer_perceptron.fit(train_X.as_matrix(), train_Y.as_matrix(), nb_epoch = 5, batch_size = 128)
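        #Note: output_dim, init and nb_epoch are Keras 1.x argument names (this run
        #uses the Theano backend); Keras 2 renames them to units, kernel_initializer
        #and epochs.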
        
        #Decision Tree
        decision_tree = DecisionTreeClassifier(max_depth = 6)
        decision_tree.fit(train_X,train_Y)
        
        #Random Forest (Default=10 Trees)
        random_forest = RandomForestClassifier()
        random_forest.fit(train_X,train_Y)
        
        #Scaling the data for the linear models (the tree-based models above use the raw features).
        train_X = preprocessing.StandardScaler().fit_transform(train_X) 
        
        #Linear Regression
        linear_regression = linear_model.LinearRegression()
        linear_regression.fit(train_X,train_Y)
        
        #Logistic Regression (L1)
        logistic_regression_L1 = linear_model.LogisticRegression(penalty = 'l1')
        logistic_regression_L1.fit(train_X,train_Y)
        
        #Logistic Regression (L2)
        logistic_regression_L2 = linear_model.LogisticRegression(penalty = 'l2')
        logistic_regression_L2.fit(train_X,train_Y)
        
        #Returns a dictionary containing the model names and their respective models.
        return {'XGBoost':gradient_boosting,'Multi Layer Perceptron':multi_layer_perceptron,'Decision Tree':decision_tree,
           'Random Forest':random_forest,'Linear Regression':linear_regression,'L1':logistic_regression_L1,
            'L2':logistic_regression_L2}
    
    #Trains the stacking model (Gradient Boosting - XGBoost)
    elif(model == 'stack'):
        
        dtrain = xgb.DMatrix(train_X,label = train_Y)
        stack = xgb.train(param, dtrain)
        return {'Stack':stack}
    
    #Trains the blending model (Gradient Boosting - XGBoost)
    else:
        
        dtrain = xgb.DMatrix(train_X,label = train_Y)
        blend = xgb.train(param, dtrain)
        return {'Blend':blend}

In [6]:
#This function computes the AUC and the predictions on the given data for the specified model.
def cross_validation(model_name,model,cross_val_X,cross_val_Y):
    
    if(model_name == 'Gradient Boosting' or model_name == 'Linear Regression'):
        
        predict = model.predict(cross_val_X)
        
    elif(model_name == 'Multi Layer Perceptron'):
        
        predict = model.predict_on_batch(cross_val_X)
    else:
        
        predict = model.predict_proba(cross_val_X)[:,1]
        
    auc = roc_auc_score(cross_val_Y,predict)
    
    return[auc,predict]
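
#Usage sketch (hedged; fold_X and fold_Y are hypothetical held-out data):
#auc, preds = cross_validation('Decision Tree', model['Decision Tree'], fold_X, fold_Y)
#XGBoost models must instead receive an xgb.DMatrix, as in the cells below.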

In [7]:
#Initializing the variables that will be used to calculate the area under the curve. (Cross Validation Data)
metric_linear_regression=list()
avg_linear_regression=0
metric_logistic_regression_L2=list()
avg_logistic_regression_L2=0
metric_logistic_regression_L1=list()
avg_logistic_regression_L1=0
metric_decision_tree=list()
avg_decision_tree=0
metric_random_forest=list()
avg_random_forest=0
metric_multi_layer_perceptron=list()
avg_multi_layer_perceptron=0
metric_gradient_boosting=list()
avg_gradient_boosting=0

In [8]:
#Cross Validation using Stratified K Fold
kf = StratifiedKFold(Data['y'], n_folds=5, shuffle=True)
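
#Equivalent on scikit-learn >= 0.18, where sklearn.cross_validation became
#sklearn.model_selection (a hedged sketch, not used in this run):
#from sklearn.model_selection import StratifiedKFold
#skf = StratifiedKFold(n_splits=5, shuffle=True)
#for train_index, cross_val_index in skf.split(Data.drop(['y'], axis=1), Data['y']):
#    ...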

In [9]:
#Training the base models, and calculating AUC on the cross validation data.
for train_index, cross_val_index in kf:
    
    #Selecting the data (Training Data & Cross Validation Data)
    train, cross_val = Data.iloc[train_index], Data.iloc[cross_val_index]
    train_Y=train['y']
    train_X=train.drop(['y'],axis=1)
    cross_val_Y=cross_val['y']
    cross_val_X=cross_val.drop(['y'],axis=1)
    scale=preprocessing.StandardScaler()
    
    #Training the base models; the resulting model names and models are stored in the variable model in the form of a dictionary.
    model=train_models(train_X,train_Y,'base')
  
    #Gradient Boosting (XGBoost)
    #The AUC (Cross Validation Data)
    [auc,predict_gradient_boosting]=cross_validation('Gradient Boosting',model['XGBoost'],xgb.DMatrix(cross_val_X,label=cross_val_Y),cross_val_Y)
    metric_gradient_boosting.append(auc)

    #Multi Layer Perceptron
    #The AUC (Cross Validation Data)
    predict_mlp=list()
    [auc,predict_multi_layer_perceptron]=cross_validation('Multi Layer Perceptron',model['Multi Layer Perceptron'],cross_val_X,cross_val_Y)
    metric_multi_layer_perceptron.append(auc)
    #The MLP returns a 2-D array of shape (n, 1), which cannot be used directly as a DataFrame column.
    #The inner values are flattened to floats before building the dataframe.
    for i in predict_multi_layer_perceptron:
        predict_mlp.append(float(i))
    
    #Decision Tree
    #The AUC (Cross Validation Data)
    [auc,predict_decision_tree]=cross_validation('Decision Tree',model['Decision Tree'],cross_val_X,cross_val_Y)
    metric_decision_tree.append(auc)
    
    #Random Forest (Default=10 Trees)
    #The AUC (Cross Validation Data)
    [auc,predict_random_forest]=cross_validation('Random Forest',model['Random Forest'],cross_val_X,cross_val_Y)
    metric_random_forest.append(auc)
    
    #Scaling the cross validation data.
    #(Note: the scaler is fit on the cross-validation fold itself rather than reusing the training-fold scaler, which is a slight train/test mismatch.)
    cross_val_X=scale.fit_transform(cross_val_X)
    
    #Linear Regression
    #The AUC (Cross Validation Data)
    [auc,predict_linear_regression]=cross_validation('Linear Regression',model['Linear Regression'],cross_val_X,cross_val_Y)
    metric_linear_regression.append(auc)
    
    #Logistic Regression (Default=l2)
    #The AUC (Cross Validation Data)
    [auc,predict_logistic_regression_L2]=cross_validation('L2',model['L2'],cross_val_X,cross_val_Y)
    metric_logistic_regression_L2.append(auc)
    
    #Logistic Regression-L1
    #The AUC (Cross Validation Data)
    [auc,predict_logistic_regression_L1]=cross_validation('L1',model['L1'],cross_val_X,cross_val_Y)
    metric_logistic_regression_L1.append(auc)
    
    #Building a list that contains all the predictions of the base models.
    predict_list=[predict_gradient_boosting,predict_decision_tree,predict_random_forest, 
                               predict_linear_regression,predict_logistic_regression_L2,
                               predict_logistic_regression_L1,predict_mlp]
    
    #Rescaling the cross validation data back to its original values.
    cross_val_X=scale.inverse_transform(cross_val_X)
    
    #Appending the true labels and the base-model predictions; these will be used to train the stacking model.
    stack_Y=stack_Y.append(cross_val_Y.tolist())
    stack_X=stack_X.append(build_data_frame(predict_list))
    
    #Appending the raw features of the cross-validation fold; these will be concatenated with the predictions for blending.
    raw_features_X=raw_features_X.append(cross_val_X.tolist())
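
#Sanity check (a hedged addition, run after the loop): StratifiedKFold partitions
#the data, so every row is held out exactly once across the 5 folds.
assert stack_X.shape == (len(Data), 7) #one row per training example, one column per base model
assert len(stack_Y) == len(Data)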


Epoch 1/5
31302/31302 [==============================] - 0s - loss: 0.3660 - acc: 0.8812     
Epoch 2/5
31302/31302 [==============================] - 0s - loss: 0.3424 - acc: 0.8874     
Epoch 3/5
31302/31302 [==============================] - 0s - loss: 0.3392 - acc: 0.8874     
Epoch 4/5
31302/31302 [==============================] - 0s - loss: 0.3362 - acc: 0.8874     
Epoch 5/5
31302/31302 [==============================] - 0s - loss: 0.3282 - acc: 0.8874     
Epoch 1/5
31302/31302 [==============================] - 0s - loss: 0.3841 - acc: 0.8358     
Epoch 2/5
31302/31302 [==============================] - 0s - loss: 0.2690 - acc: 0.8875     
Epoch 3/5
31302/31302 [==============================] - 0s - loss: 0.2569 - acc: 0.8978     
Epoch 4/5
31302/31302 [==============================] - 0s - loss: 0.2540 - acc: 0.9012     
Epoch 5/5
31302/31302 [==============================] - 0s - loss: 0.2483 - acc: 0.9025     
Epoch 1/5
31302/31302 [==============================] - 0s - loss: 0.3304 - acc: 0.8816     
Epoch 2/5
31302/31302 [==============================] - 0s - loss: 0.2661 - acc: 0.8886     
Epoch 3/5
31302/31302 [==============================] - 0s - loss: 0.2581 - acc: 0.8984     
Epoch 4/5
31302/31302 [==============================] - 0s - loss: 0.2559 - acc: 0.9003     
Epoch 5/5
31302/31302 [==============================] - 0s - loss: 0.2495 - acc: 0.9047     
Epoch 1/5
31303/31303 [==============================] - 0s - loss: 0.3044 - acc: 0.8873     
Epoch 2/5
31303/31303 [==============================] - 0s - loss: 0.2786 - acc: 0.8874     
Epoch 3/5
31303/31303 [==============================] - 0s - loss: 0.2667 - acc: 0.8876     
Epoch 4/5
31303/31303 [==============================] - 0s - loss: 0.2609 - acc: 0.8912     
Epoch 5/5
31303/31303 [==============================] - 0s - loss: 0.2554 - acc: 0.8952     
Epoch 1/5
31303/31303 [==============================] - 0s - loss: 0.3272 - acc: 0.8845     
Epoch 2/5
31303/31303 [==============================] - 0s - loss: 0.2764 - acc: 0.8874     
Epoch 3/5
31303/31303 [==============================] - 0s - loss: 0.2645 - acc: 0.8914     
Epoch 4/5
31303/31303 [==============================] - 0s - loss: 0.2598 - acc: 0.8920     
Epoch 5/5
31303/31303 [==============================] - 0s - loss: 0.2562 - acc: 0.8929     

In [10]:
#Averaging the AUC over the cross-validation folds for each model.
avg_linear_regression=np.mean(metric_linear_regression)
avg_logistic_regression_L2=np.mean(metric_logistic_regression_L2)
avg_logistic_regression_L1=np.mean(metric_logistic_regression_L1)
avg_decision_tree=np.mean(metric_decision_tree)
avg_random_forest=np.mean(metric_random_forest)
avg_multi_layer_perceptron=np.mean(metric_multi_layer_perceptron)
avg_gradient_boosting=np.mean(metric_gradient_boosting)

In [11]:
#Printing the AUC for the base models.
print (' AUC (Linear Regression)\n',avg_linear_regression)
print (' AUC (Logistic Regression - L2)\n',avg_logistic_regression_L2)
print (' AUC (Logistic Regression - L1)\n',avg_logistic_regression_L1)
print (' AUC (Decision Tree)\n',avg_decision_tree)
print (' AUC (Random Forest)\n',avg_random_forest)
print (' AUC (Multi Layer Perceptron)\n',avg_multi_layer_perceptron)
print (' AUC (Gradient Boosting - XGBoost)\n',avg_gradient_boosting)


 AUC (Linear Regression)
 0.927633509729
 AUC (Logistic Regression - L2)
 0.929307749504
 AUC (Logistic Regression - L1)
 0.929319089088
 AUC (Decision Tree)
 0.925024685812
 AUC (Random Forest)
 0.917223061441
 AUC (Multi Layer Perceptron)
 0.879044122467
 AUC (Gradient Boosting - XGBoost)
 0.948086039187

In [12]:
#Training the stacking model (XGBoost - Gradient Boosting)
model_stack=train_models(stack_X,stack_Y,'stack')

#Concatenating the raw features with the base-model predictions into a single dataframe, which will be used to train the blending model.
blend_X=pd.concat([raw_features_X, stack_X], axis=1,ignore_index=True)

#Training the blending model (XGBoost - Gradient Boosting)
model_blend=train_models(blend_X,stack_Y,'blend')
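
#Sanity check (a hedged addition): blend_X should hold the raw features plus
#one prediction column per base model.
assert blend_X.shape[1] == raw_features_X.shape[1] + stack_X.shape[1]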

In [13]:
#Initializing the variables that will be used to calculate the area under the curve. (Test Data)
metric_logistic_regression_L2=list()
metric_logistic_regression_L1=list()
metric_decision_tree=list()
metric_random_forest=list()
metric_multi_layer_perceptron=list()
metric_gradient_boosting=list()
metric_stack=list()
metric_blend=list()
blend_X = pd.DataFrame()
raw_features_X = pd.DataFrame()

In [14]:
#Calculating AUC for all the models (base, stacking, and blending) on the test data.

#Selecting the test data
test_Y=test['y']
test_X=test.drop(['y'],axis=1)
scale=preprocessing.StandardScaler()
    
#Gradient Boosting (XGBoost)
#The AUC (Test Data)
[auc,predict_XGB]=cross_validation('Gradient Boosting',model['XGBoost'],xgb.DMatrix(test_X,label=test_Y),test_Y)
metric_gradient_boosting=auc

    
#Multi Layer Perceptron
#The AUC (Test Data)
predict_mlp=list()
[auc,predict_multi_layer_perceptron]=cross_validation('Multi Layer Perceptron',model['Multi Layer Perceptron'],test_X,test_Y)
metric_multi_layer_perceptron=auc

#The MLP returns a 2-D array of shape (n, 1), which cannot be used directly as a DataFrame column.
#The inner values are flattened to floats before building the dataframe.
for i in predict_multi_layer_perceptron:
    predict_mlp.append(float(i))


#Decision Tree
#The AUC (Test Data)
[auc,predict_decision_tree]=cross_validation('Decision Tree',model['Decision Tree'],test_X,test_Y)
metric_decision_tree=auc
    
    
#Random Forest (Default=10 Trees)
#The AUC (Test Data)
[auc,predict_random_forest]=cross_validation('Random Forest',model['Random Forest'],test_X,test_Y)
metric_random_forest=auc
    
#Scaling the test data.
test_X=scale.fit_transform(test_X)

#Linear Regression
#The AUC (Test Data)
[auc,predict_linear_regression]=cross_validation('Linear Regression',model['Linear Regression'],test_X,test_Y)
metric_linear_regression=auc
    
#Logistic Regression (Default=l2)
#The AUC (Test Data)
[auc,predict_logistic_regression_L2]=cross_validation('L2',model['L2'],test_X,test_Y)
metric_logistic_regression_L2=auc

#Logistic Regression-L1
#The AUC (Test Data)
[auc,predict_logistic_regression_L1]=cross_validation('L1',model['L1'],test_X,test_Y)
metric_logistic_regression_L1=auc

#Building a list that contains all the predictions of the base models.
predict_list=[predict_XGB,predict_decision_tree,predict_random_forest, 
                               predict_linear_regression,predict_logistic_regression_L2,
                               predict_logistic_regression_L1,predict_mlp]

#Rescaling the test data back to its original values.
test_X=scale.inverse_transform(test_X)
    
#Stacking (XGBoost - Gradient Boosting)
dstack_X=build_data_frame(predict_list) #Converting the list of predictions into a dataframe.
[auc,predict_stack]=cross_validation('Gradient Boosting',model_stack['Stack'],xgb.DMatrix(dstack_X,label=test_Y),test_Y)
metric_stack=auc

#Blending (XGBoost - Gradient Boosting)
raw_features_X=raw_features_X.append(test_X.tolist())
blend_X=pd.concat([raw_features_X, dstack_X], axis=1,ignore_index=True) #Concatenating the raw features with the base-model predictions into a single dataframe.
[auc,predict_blend]=cross_validation('Gradient Boosting',model_blend['Blend'],xgb.DMatrix(blend_X,label=test_Y),test_Y)
metric_blend=auc

In [15]:
print (' AUC (Linear Regression)\n',metric_linear_regression)
print (' AUC (Logistic Regression - L2)\n',metric_logistic_regression_L2)
print (' AUC (Logistic Regression - L1)\n',metric_logistic_regression_L1)
print (' AUC (Decision Tree)\n',metric_decision_tree)
print (' AUC (Random Forest)\n',metric_random_forest)
print (' AUC (Multi Layer Perceptron)\n',metric_multi_layer_perceptron)
print (' AUC (Gradient Boosting - XGBoost)\n',metric_gradient_boosting)
print (' AUC (Stacking)\n',metric_stack)
print (' AUC (Blending)\n',metric_blend)


 AUC (Linear Regression)
 0.932326643024
 AUC (Logistic Regression - L2)
 0.934594997359
 AUC (Logistic Regression - L1)
 0.934630366709
 AUC (Decision Tree)
 0.928261525692
 AUC (Random Forest)
 0.914071342338
 AUC (Multi Layer Perceptron)
 0.881355872255
 AUC (Gradient Boosting - XGBoost)
 0.950255366709
 AUC (Stacking)
 0.949767269675
 AUC (Blending)
 0.951025239568