notebook.community

Edit and run



In [20]:

    
# This file is the very first try of lightGBM, compare with xgboost
# using single thread, wihout cross validation

import numpy as np 
import pandas as pd 
from pandas import Series, DataFrame 
import lightgbm as lgb 
import xgboost as xgb 
from datetime import datetime 
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score



In [2]:

    
data=pd.read_csv('adult.csv',header=None) 
# assign column names to the data
data.columns=['age','workclass','fnlwgt','education','education-num','marital_Status','occupation','relationship','race','sex','capital_gain','capital_loss','hours_per_week','native_country','Income'] 
data.head()









    Out[2]:







  
    
      
      age
      workclass
      fnlwgt
      education
      education-num
      marital_Status
      occupation
      relationship
      race
      sex
      capital_gain
      capital_loss
      hours_per_week
      native_country
      Income
    
  
  
    
      0
      39
      State-gov
      77516
      Bachelors
      13
      Never-married
      Adm-clerical
      Not-in-family
      White
      Male
      2174
      0
      40
      United-States
      <=50K
    
    
      1
      50
      Self-emp-not-inc
      83311
      Bachelors
      13
      Married-civ-spouse
      Exec-managerial
      Husband
      White
      Male
      0
      0
      13
      United-States
      <=50K
    
    
      2
      38
      Private
      215646
      HS-grad
      9
      Divorced
      Handlers-cleaners
      Not-in-family
      White
      Male
      0
      0
      40
      United-States
      <=50K
    
    
      3
      53
      Private
      234721
      11th
      7
      Married-civ-spouse
      Handlers-cleaners
      Husband
      Black
      Male
      0
      0
      40
      United-States
      <=50K
    
    
      4
      28
      Private
      338409
      Bachelors
      13
      Married-civ-spouse
      Prof-specialty
      Wife
      Black
      Female
      0
      0
      40
      Cuba
      <=50K



In [3]:

    
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

# encode label
l=LabelEncoder() 
l.fit(data.Income) 
l.classes_ 
data.Income=Series(l.transform(data.Income))



In [4]:

    
data.Income.value_counts() # label has been encoded as 0, 1









    Out[4]:





0    24720
1     7841
Name: Income, dtype: int64



In [5]:

    
# convert categorical data into one-hot, and drop original categorical data
one_hot_workclass=pd.get_dummies(data.workclass) 
one_hot_education=pd.get_dummies(data.education) 
one_hot_marital_Status=pd.get_dummies(data.marital_Status) 
one_hot_occupation=pd.get_dummies(data.occupation)
one_hot_relationship=pd.get_dummies(data.relationship) 
one_hot_race=pd.get_dummies(data.race) 
one_hot_sex=pd.get_dummies(data.sex) 
one_hot_native_country=pd.get_dummies(data.native_country) 

data.drop(['workclass','education','marital_Status','occupation','relationship','race','sex','native_country'],axis=1,inplace=True)



In [6]:

    
data=pd.concat([data,one_hot_workclass,one_hot_education,one_hot_marital_Status,one_hot_occupation,one_hot_relationship,one_hot_race,one_hot_sex,one_hot_native_country],axis=1)



In [7]:

    
#removing dulpicate columns 
i = np.unique(data.columns, return_index=True) 
i[1]  # index of unique columns









    Out[7]:





array([ 16,  17,  18,  19,  20,  21,  22,   7,  40,  60,  41,  61,  23,
        24,  25,  62,  68,  69,  70,  71,  42,  72,  32,  26,  73,  74,
        75,  76,  43,  44,   8,  65,  77,  78,  79,  80,  27,  81,  45,
        82,  83,  84,  85,  54,  86,  87,  88,  89,  90,  91,  92,   9,
        46,  66,  33,  34,  35,  28,  93,  36,  10,  94,  55,  63,  56,
        47,  95,  57,  96,  97,  98,  99,  29,  48,  11,  30,  49,  50,
       100,  51, 101,  12,  13,  37,  31, 102,  14, 103,  52, 104,  53,
       105, 106,  58, 107,  64,  38,  59,  15, 108,   6,   0,   3,   4,
         2,   1,   5])



In [8]:

    
data=data.iloc[:, i[1]]  # use the index of unique columns
data.head()









    Out[8]:







  
    
      
      10th
      11th
      12th
      1st-4th
      5th-6th
      7th-8th
      9th
      ?
      Adm-clerical
      Amer-Indian-Eskimo
      ...
      Wife
      Without-pay
      Yugoslavia
      Income
      age
      capital_gain
      capital_loss
      education-num
      fnlwgt
      hours_per_week
    
  
  
    
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      ...
      0
      0
      0
      0
      39
      2174
      0
      13
      77516
      40
    
    
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      50
      0
      0
      13
      83311
      13
    
    
      2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      38
      0
      0
      9
      215646
      40
    
    
      3
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      53
      0
      0
      7
      234721
      40
    
    
      4
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      1
      0
      0
      0
      28
      0
      0
      13
      338409
      40
    
  

5 rows × 107 columns



In [9]:

    
features = data.drop('Income',axis=1) 
label = data.Income



In [10]:

    
label.mode()[0]
label.fillna(label.mode()[0],inplace=True)  # impute missing data with mode



In [11]:

    
label.value_counts()









    Out[11]:





0    24720
1     7841
Name: Income, dtype: int64



In [12]:

    
#Now splitting our dataset into test and train 
from sklearn.model_selection import train_test_split 

features_train,features_test,label_train,label_test=train_test_split(features,label,test_size=.3)



In [17]:

    
features_train.head()
features_test.head()
label_train.head()
label_test.head()









    Out[17]:





19851    0
15010    0
5949     0
668      0
30677    0
Name: Income, dtype: int64



In [28]:

    
# method 1 - XGBOOST, single thread
dtrain=xgb.DMatrix(features_train,label=label_train)
dtest=xgb.DMatrix(features_test)
## xgboost booster params
parameters={'max_depth':7, 'eta':1, 'silent':1,'objective':'binary:logistic','eval_metric':'auc','learning_rate':.05}



In [29]:

    
num_round=50    # number of boosting iterations
start = datetime.now() 
xg=xgb.train(parameters,dtrain,num_round)  # train the model
stop = datetime.now()
execution_time_xgb = stop-start 
print(execution_time_xgb)   # 0:00:05.880084









    



0:00:05.880084



In [30]:

    
ypred=xg.predict(dtest) 
print(ypred)









    



[ 0.05520488  0.44592044  0.12861401 ...,  0.04373964  0.04373964
  0.32243791]



In [31]:

    
#Converting probabilities into 1 or 0  
for i in range(0,9769): 
    if ypred[i]>=.5:       # setting threshold to .5 
       ypred[i]=1 
    else: 
       ypred[i]=0



In [32]:

    
accuracy_xgb = accuracy_score(label_test,ypred) 
accuracy_xgb  # 0.87030402292967546









    Out[32]:





0.87030402292967546



In [35]:

    
cm = confusion_matrix(label_test, ypred)
TP = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TN = cm[1][1]

accuracy_xgb = (TP + TN)/(TP+FP+FN+TN)   # accuracy: 0.87030402293
auc_score_xgb = roc_auc_score(label_test, ypred)   # AUC: 0.777341026858
precision_xgb = TP/(TP+FP)   # precision: 0.952047413793
specificity_xgb = TN/(TN+FP)  # specificity: 0.797612279704
recall_xgb = TP/(TP+FN)   # recall: 0.885017421603
print(auc_score_xgb)









    



0.777341026858



In [13]:

    
# method 2 - Light GBM, single thread (make -j)

dtrain = lgb.Dataset(features_train,label=label_train)
param = {'num_leaves':150, 'objective':'binary','max_depth':7,'learning_rate':.05,'max_bin':200}
param['metric'] = ['auc', 'binary_logloss']



In [16]:

    
# train the model

num_round=50
start=datetime.now()
lgbm=lgb.train(param,dtrain,num_round)
stop=datetime.now()
execution_time_lgbm = stop-start
print(execution_time_lgbm)    # 0:00:01.107647









    



0:00:01.107647



In [17]:

    
# predict 

ypred2=lgbm.predict(features_test)
ypred2[0:5]  # showing first 5 predictions









    Out[17]:





array([ 0.0568377 ,  0.44797529,  0.13431321,  0.04504189,  0.06630088])



In [19]:

    
#Converting probabilities into 1 or 0  
for i in range(0,9769): 
    if ypred2[i]>=.5:       # setting threshold to .5 
       ypred2[i]=1 
    else: 
       ypred2[i]=0



In [21]:

    
accuracy_lgb = accuracy_score(label_test,ypred2) 
accuracy_lgb  # 0.87081584604360729









    Out[21]:





0.87081584604360729



In [36]:

    
cm = confusion_matrix(label_test, ypred2)
TP = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TN = cm[1][1]

accuracy_lgb = (TP + TN)/(TP+FP+FN+TN)   # accuracy: 0.870815846044
auc_score_lgb = roc_auc_score(label_test, ypred2)   # AUC: 0.777080344354
precision_lgb = TP/(TP+FP)   # precision: 0.955197853789
specificity_lgb = TN/(TN+FP)  # specificity: 0.805813953488
recall_lgb = TP/(TP+FN)   # recall: 0.88470617468
print(recall_lgb)









    



0.88470617468



In [38]:

    
# compare single thread light GBM and single thread xgboost
comparison_dict = {'accuracy score':(accuracy_lgb,accuracy_xgb),'auc score':(auc_score_lgb,auc_score_xgb),'execution time':(execution_time_lgbm,execution_time_xgb),
                  'precision':(precision_lgb, precision_xgb), 'specificity':(specificity_lgb, specificity_xgb), 'recall':(recall_lgb, recall_xgb)}
comparison_df = DataFrame(comparison_dict) 
comparison_df.index= ['LightGBM','xgboost'] 
comparison_df









    Out[38]:







  
    
      
      accuracy score
      auc score
      execution time
      precision
      recall
      specificity
    
  
  
    
      LightGBM
      0.870816
      0.777080
      00:00:01.107647
      0.955198
      0.884706
      0.805814
    
    
      xgboost
      0.870304
      0.777341
      00:00:05.880084
      0.953991
      0.885017
      0.802077



In [ ]:

    
'''
NOTE: when you are using LightGBM/xgboost train() method, it does not give seed, 
therefore, if you run it multiple times, you will get different results
'''

	age	workclass	fnlwgt	education	education-num	marital_Status	occupation	relationship	race	sex	capital_gain	hours_per_week	native_country	Income
0	39	State-gov	77516	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	40	United-States	<=50K
1	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	13	United-States	<=50K
2	38	Private	215646	HS-grad	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	0	40	United-States	<=50K
3	53	Private	234721	11th	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0	40	United-States	<=50K
4	28	Private	338409	Bachelors	13	Married-civ-spouse	Prof-specialty	Wife	Black	Female	0	40	Cuba	<=50K

	accuracy score	auc score	execution time	precision	recall	specificity
LightGBM	0.870816	0.777080	00:00:01.107647	0.955198	0.884706	0.805814
xgboost	0.870304	0.777341	00:00:05.880084	0.953991	0.885017	0.802077