In [20]:
# A first pass at LightGBM, compared against XGBoost on the same data
# (single thread, without cross-validation)
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import lightgbm as lgb
import xgboost as xgb
from datetime import datetime
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
In [2]:
data=pd.read_csv('adult.csv',header=None)
# assign column names to the data
data.columns=['age','workclass','fnlwgt','education','education-num','marital_Status','occupation','relationship','race','sex','capital_gain','capital_loss','hours_per_week','native_country','Income']
data.head()
Out[2]:
In [3]:
from sklearn.preprocessing import LabelEncoder # OneHotEncoder not needed; pd.get_dummies is used below
# encode label
l=LabelEncoder()
l.fit(data.Income)
l.classes_
data.Income=Series(l.transform(data.Income))
In [4]:
data.Income.value_counts() # label has been encoded as 0, 1
Out[4]:
In [5]:
# convert categorical data into one-hot, and drop original categorical data
one_hot_workclass=pd.get_dummies(data.workclass)
one_hot_education=pd.get_dummies(data.education)
one_hot_marital_Status=pd.get_dummies(data.marital_Status)
one_hot_occupation=pd.get_dummies(data.occupation)
one_hot_relationship=pd.get_dummies(data.relationship)
one_hot_race=pd.get_dummies(data.race)
one_hot_sex=pd.get_dummies(data.sex)
one_hot_native_country=pd.get_dummies(data.native_country)
data.drop(['workclass','education','marital_Status','occupation','relationship','race','sex','native_country'],axis=1,inplace=True)
In [6]:
data=pd.concat([data,one_hot_workclass,one_hot_education,one_hot_marital_Status,one_hot_occupation,one_hot_relationship,one_hot_race,one_hot_sex,one_hot_native_country],axis=1)
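In [ ]:
# Aside (a sketch, not executed in the original run): pd.get_dummies also
# accepts a `columns` argument, so the eight calls plus the drop above could
# be collapsed into a single step:
cat_cols = ['workclass','education','marital_Status','occupation',
            'relationship','race','sex','native_country']
# data = pd.get_dummies(data, columns=cat_cols)  # commented out: `data` was already encoded above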
In [7]:
# remove duplicate columns (one-hot encoding the categorical columns separately
# can produce repeated names, e.g. the ' ?' placeholder appears in several)
_, unique_idx = np.unique(data.columns, return_index=True)
unique_idx # positions of the first occurrence of each unique column name
Out[7]:
In [8]:
data=data.iloc[:, unique_idx] # keep one copy of each column (note: this also sorts columns by name)
data.head()
Out[8]:
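In [ ]:
# Aside (an equivalent sketch): pandas can drop repeated column names directly,
# and unlike np.unique it preserves the original column order:
data_dedup = data.loc[:, ~data.columns.duplicated()] # keep first occurrence of each name
data_dedup.shape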
In [9]:
features = data.drop('Income',axis=1)
label = data.Income
In [10]:
label.mode()[0]
label.fillna(label.mode()[0],inplace=True) # impute any missing labels with the mode (a no-op if Income had no NaNs)
In [11]:
label.value_counts()
Out[11]:
In [12]:
#Now splitting our dataset into test and train
from sklearn.model_selection import train_test_split
features_train,features_test,label_train,label_test=train_test_split(features,label,test_size=.3)
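In [ ]:
# Aside (a sketch, not used for the results below): passing random_state makes
# the split reproducible, and stratify=label keeps the 0/1 class ratio the same
# in train and test:
Xtr, Xte, ytr, yte = train_test_split(features, label, test_size=.3,
                                      random_state=42, stratify=label)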
In [17]:
features_train.head()
features_test.head()
label_train.head()
label_test.head() # only this last expression is displayed by the notebook
Out[17]:
In [28]:
# method 1 - XGBOOST, single thread
dtrain=xgb.DMatrix(features_train,label=label_train)
dtest=xgb.DMatrix(features_test)
## XGBoost booster params ('eta' is an alias of 'learning_rate', so only one is set)
parameters={'max_depth':7, 'silent':1, 'objective':'binary:logistic', 'eval_metric':'auc', 'learning_rate':.05}
In [29]:
num_round=50 # number of boosting iterations
start = datetime.now()
xg=xgb.train(parameters,dtrain,num_round) # train the model
stop = datetime.now()
execution_time_xgb = stop-start
print(execution_time_xgb) # 0:00:05.880084
In [30]:
ypred=xg.predict(dtest)
print(ypred)
In [31]:
# convert predicted probabilities into hard 0/1 labels
for i in range(len(ypred)):
    if ypred[i] >= .5: # decision threshold of 0.5
        ypred[i] = 1
    else:
        ypred[i] = 0
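In [ ]:
# Aside (an equivalent vectorized sketch): numpy can apply the 0.5 threshold in
# one step instead of looping over each prediction:
ypred_vec = (xg.predict(dtest) >= .5).astype(int)
(ypred_vec == ypred).all() # True: same labels as the loop above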
In [32]:
accuracy_xgb = accuracy_score(label_test,ypred)
accuracy_xgb # 0.87030402292967546
Out[32]:
In [35]:
cm = confusion_matrix(label_test, ypred)
# sklearn's confusion_matrix puts true labels in rows and predicted labels in
# columns, with classes sorted, so cm[0][0] is the correctly classified class-0
# (<=50K) count. The names below therefore treat class 0 as the positive class.
TP = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TN = cm[1][1]
accuracy_xgb = (TP + TN)/(TP+FP+FN+TN) # accuracy: 0.87030402293
auc_score_xgb = roc_auc_score(label_test, ypred) # AUC: 0.777341026858
precision_xgb = TP/(TP+FP) # precision: 0.952047413793
specificity_xgb = TN/(TN+FP) # specificity: 0.797612279704
recall_xgb = TP/(TP+FN) # recall: 0.885017421603
print(auc_score_xgb)
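In [ ]:
# Aside (a cross-check sketch): sklearn computes the same metrics directly;
# pos_label=0 matches the class-0-as-positive convention used above.
from sklearn.metrics import precision_score, recall_score
print(precision_score(label_test, ypred, pos_label=0)) # should equal precision_xgb
print(recall_score(label_test, ypred, pos_label=0))    # should equal recall_xgb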
In [13]:
# method 2 - LightGBM, single thread (library built via make -j)
dtrain = lgb.Dataset(features_train,label=label_train)
# note: with max_depth=7 a tree has at most 2**7 = 128 leaves, so
# num_leaves=150 can never actually be reached
param = {'num_leaves':150, 'objective':'binary','max_depth':7,'learning_rate':.05,'max_bin':200}
param['metric'] = ['auc', 'binary_logloss']
In [16]:
# train the model
num_round=50
start=datetime.now()
lgbm=lgb.train(param,dtrain,num_round)
stop=datetime.now()
execution_time_lgbm = stop-start
print(execution_time_lgbm) # 0:00:01.107647
In [17]:
# predict
ypred2=lgbm.predict(features_test)
ypred2[0:5] # showing first 5 predictions
Out[17]:
In [19]:
# convert predicted probabilities into hard 0/1 labels (same thresholding as
# for XGBoost above; the vectorized one-liner works here too)
for i in range(len(ypred2)):
    if ypred2[i] >= .5: # decision threshold of 0.5
        ypred2[i] = 1
    else:
        ypred2[i] = 0
In [21]:
accuracy_lgb = accuracy_score(label_test,ypred2)
accuracy_lgb # 0.87081584604360729
Out[21]:
In [36]:
cm = confusion_matrix(label_test, ypred2)
# as above, cm[0][0] is the class-0 (<=50K) cell, so class 0 plays the role of
# the positive class in the metrics below
TP = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TN = cm[1][1]
accuracy_lgb = (TP + TN)/(TP+FP+FN+TN) # accuracy: 0.870815846044
auc_score_lgb = roc_auc_score(label_test, ypred2) # AUC: 0.777080344354
precision_lgb = TP/(TP+FP) # precision: 0.955197853789
specificity_lgb = TN/(TN+FP) # specificity: 0.805813953488
recall_lgb = TP/(TP+FN) # recall: 0.88470617468
print(recall_lgb)
In [38]:
# compare single-thread LightGBM against single-thread XGBoost
comparison_dict = {'accuracy score': (accuracy_lgb, accuracy_xgb),
                   'auc score': (auc_score_lgb, auc_score_xgb),
                   'execution time': (execution_time_lgbm, execution_time_xgb),
                   'precision': (precision_lgb, precision_xgb),
                   'specificity': (specificity_lgb, specificity_xgb),
                   'recall': (recall_lgb, recall_xgb)}
comparison_df = DataFrame(comparison_dict)
comparison_df.index= ['LightGBM','xgboost']
comparison_df
Out[38]:
In [ ]:
'''
NOTE: neither train() call above was given a random seed, and the
train_test_split was unseeded as well, so re-running this notebook will
produce slightly different numbers each time. See the seeding sketch below.
'''
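In [ ]:
# Aside (a sketch of how the runs could be made reproducible; 'seed' is a
# documented parameter name in both libraries):
xgb_params = dict(parameters, seed=42) # XGBoost: fixes the booster's RNG
lgb_params = dict(param, seed=42)      # LightGBM: fixes its RNG likewise
# combine with random_state=42 in train_test_split (see the earlier aside)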