In [20]:
# This file is the very first try of lightGBM, compare with xgboost
# using single thread, wihout cross validation

import numpy as np 
import pandas as pd 
from pandas import Series, DataFrame 
import lightgbm as lgb 
import xgboost as xgb 
from datetime import datetime 
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

In [2]:
data=pd.read_csv('adult.csv',header=None) 
# assign column names to the data
data.columns=['age','workclass','fnlwgt','education','education-num','marital_Status','occupation','relationship','race','sex','capital_gain','capital_loss','hours_per_week','native_country','Income'] 
data.head()


Out[2]:
age workclass fnlwgt education education-num marital_Status occupation relationship race sex capital_gain capital_loss hours_per_week native_country Income
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K

In [3]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

# encode label
l=LabelEncoder() 
l.fit(data.Income) 
l.classes_ 
data.Income=Series(l.transform(data.Income))

In [4]:
data.Income.value_counts() # label has been encoded as 0, 1


Out[4]:
0    24720
1     7841
Name: Income, dtype: int64

In [5]:
# convert categorical data into one-hot, and drop original categorical data
one_hot_workclass=pd.get_dummies(data.workclass) 
one_hot_education=pd.get_dummies(data.education) 
one_hot_marital_Status=pd.get_dummies(data.marital_Status) 
one_hot_occupation=pd.get_dummies(data.occupation)
one_hot_relationship=pd.get_dummies(data.relationship) 
one_hot_race=pd.get_dummies(data.race) 
one_hot_sex=pd.get_dummies(data.sex) 
one_hot_native_country=pd.get_dummies(data.native_country) 

data.drop(['workclass','education','marital_Status','occupation','relationship','race','sex','native_country'],axis=1,inplace=True)

In [6]:
data=pd.concat([data,one_hot_workclass,one_hot_education,one_hot_marital_Status,one_hot_occupation,one_hot_relationship,one_hot_race,one_hot_sex,one_hot_native_country],axis=1)

In [7]:
#removing dulpicate columns 
i = np.unique(data.columns, return_index=True) 
i[1]  # index of unique columns


Out[7]:
array([ 16,  17,  18,  19,  20,  21,  22,   7,  40,  60,  41,  61,  23,
        24,  25,  62,  68,  69,  70,  71,  42,  72,  32,  26,  73,  74,
        75,  76,  43,  44,   8,  65,  77,  78,  79,  80,  27,  81,  45,
        82,  83,  84,  85,  54,  86,  87,  88,  89,  90,  91,  92,   9,
        46,  66,  33,  34,  35,  28,  93,  36,  10,  94,  55,  63,  56,
        47,  95,  57,  96,  97,  98,  99,  29,  48,  11,  30,  49,  50,
       100,  51, 101,  12,  13,  37,  31, 102,  14, 103,  52, 104,  53,
       105, 106,  58, 107,  64,  38,  59,  15, 108,   6,   0,   3,   4,
         2,   1,   5])

In [8]:
data=data.iloc[:, i[1]]  # use the index of unique columns
data.head()


Out[8]:
10th 11th 12th 1st-4th 5th-6th 7th-8th 9th ? Adm-clerical Amer-Indian-Eskimo ... Wife Without-pay Yugoslavia Income age capital_gain capital_loss education-num fnlwgt hours_per_week
0 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 39 2174 0 13 77516 40
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 50 0 0 13 83311 13
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 38 0 0 9 215646 40
3 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 53 0 0 7 234721 40
4 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 28 0 0 13 338409 40

5 rows × 107 columns


In [9]:
features = data.drop('Income',axis=1) 
label = data.Income

In [10]:
label.mode()[0]
label.fillna(label.mode()[0],inplace=True)  # impute missing data with mode

In [11]:
label.value_counts()


Out[11]:
0    24720
1     7841
Name: Income, dtype: int64

In [12]:
#Now splitting our dataset into test and train 
from sklearn.model_selection import train_test_split 

features_train,features_test,label_train,label_test=train_test_split(features,label,test_size=.3)

In [17]:
features_train.head()
features_test.head()
label_train.head()
label_test.head()


Out[17]:
19851    0
15010    0
5949     0
668      0
30677    0
Name: Income, dtype: int64

In [28]:
# method 1 - XGBOOST, single thread
dtrain=xgb.DMatrix(features_train,label=label_train)
dtest=xgb.DMatrix(features_test)
## xgboost booster params
parameters={'max_depth':7, 'eta':1, 'silent':1,'objective':'binary:logistic','eval_metric':'auc','learning_rate':.05}

In [29]:
num_round=50    # number of boosting iterations
start = datetime.now() 
xg=xgb.train(parameters,dtrain,num_round)  # train the model
stop = datetime.now()
execution_time_xgb = stop-start 
print(execution_time_xgb)   # 0:00:05.880084


0:00:05.880084

In [30]:
ypred=xg.predict(dtest) 
print(ypred)


[ 0.05520488  0.44592044  0.12861401 ...,  0.04373964  0.04373964
  0.32243791]

In [31]:
#Converting probabilities into 1 or 0  
for i in range(0,9769): 
    if ypred[i]>=.5:       # setting threshold to .5 
       ypred[i]=1 
    else: 
       ypred[i]=0

In [32]:
accuracy_xgb = accuracy_score(label_test,ypred) 
accuracy_xgb  # 0.87030402292967546


Out[32]:
0.87030402292967546

In [35]:
cm = confusion_matrix(label_test, ypred)
TP = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TN = cm[1][1]

accuracy_xgb = (TP + TN)/(TP+FP+FN+TN)   # accuracy: 0.87030402293
auc_score_xgb = roc_auc_score(label_test, ypred)   # AUC: 0.777341026858
precision_xgb = TP/(TP+FP)   # precision: 0.952047413793
specificity_xgb = TN/(TN+FP)  # specificity: 0.797612279704
recall_xgb = TP/(TP+FN)   # recall: 0.885017421603
print(auc_score_xgb)


0.777341026858

In [13]:
# method 2 - Light GBM, single thread (make -j)

dtrain = lgb.Dataset(features_train,label=label_train)
param = {'num_leaves':150, 'objective':'binary','max_depth':7,'learning_rate':.05,'max_bin':200}
param['metric'] = ['auc', 'binary_logloss']

In [16]:
# train the model

num_round=50
start=datetime.now()
lgbm=lgb.train(param,dtrain,num_round)
stop=datetime.now()
execution_time_lgbm = stop-start
print(execution_time_lgbm)    # 0:00:01.107647


0:00:01.107647

In [17]:
# predict 

ypred2=lgbm.predict(features_test)
ypred2[0:5]  # showing first 5 predictions


Out[17]:
array([ 0.0568377 ,  0.44797529,  0.13431321,  0.04504189,  0.06630088])

In [19]:
#Converting probabilities into 1 or 0  
for i in range(0,9769): 
    if ypred2[i]>=.5:       # setting threshold to .5 
       ypred2[i]=1 
    else: 
       ypred2[i]=0

In [21]:
accuracy_lgb = accuracy_score(label_test,ypred2) 
accuracy_lgb  # 0.87081584604360729


Out[21]:
0.87081584604360729

In [36]:
cm = confusion_matrix(label_test, ypred2)
TP = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TN = cm[1][1]

accuracy_lgb = (TP + TN)/(TP+FP+FN+TN)   # accuracy: 0.870815846044
auc_score_lgb = roc_auc_score(label_test, ypred2)   # AUC: 0.777080344354
precision_lgb = TP/(TP+FP)   # precision: 0.955197853789
specificity_lgb = TN/(TN+FP)  # specificity: 0.805813953488
recall_lgb = TP/(TP+FN)   # recall: 0.88470617468
print(recall_lgb)


0.88470617468

In [38]:
# compare single thread light GBM and single thread xgboost
comparison_dict = {'accuracy score':(accuracy_lgb,accuracy_xgb),'auc score':(auc_score_lgb,auc_score_xgb),'execution time':(execution_time_lgbm,execution_time_xgb),
                  'precision':(precision_lgb, precision_xgb), 'specificity':(specificity_lgb, specificity_xgb), 'recall':(recall_lgb, recall_xgb)}
comparison_df = DataFrame(comparison_dict) 
comparison_df.index= ['LightGBM','xgboost'] 
comparison_df


Out[38]:
accuracy score auc score execution time precision recall specificity
LightGBM 0.870816 0.777080 00:00:01.107647 0.955198 0.884706 0.805814
xgboost 0.870304 0.777341 00:00:05.880084 0.953991 0.885017 0.802077

In [ ]:
'''
NOTE: when you are using LightGBM/xgboost train() method, it does not give seed, 
therefore, if you run it multiple times, you will get different results
'''