In [ ]:
# In this file, trying lightGBM with cross validation
# try the package in scikit-learn too
# single thread
# lgb document: http://lightgbm.readthedocs.io/en/latest/python/lightgbm.html#lightgbm-package
# xgb document: http://xgboost.readthedocs.io/en/latest/python/python_api.html
# LightGBM scikit-learn API: http://lightgbm.readthedocs.io/en/latest/python/lightgbm.html#scikit-learn-api
# XGBoost scikit-learn API: http://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
# both scikit-learn lightGBM, Xgboost provide regressor and classifier
In [13]:
# Core scientific stack
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# Gradient-boosting libraries: native APIs plus their scikit-learn wrappers
import lightgbm as lgb
import xgboost as xgb
from datetime import datetime
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from lightgbm.sklearn import LGBMClassifier
from xgboost.sklearn import XGBClassifier
# NOTE(review): sklearn.grid_search is the deprecated pre-0.18 module (removed in
# scikit-learn 0.20). The tuning cells below rely on this version's attributes
# (grid_scores_, iid=), so migrating to sklearn.model_selection.GridSearchCV
# would also require updating those cells to cv_results_ — TODO confirm the
# installed scikit-learn version before switching.
from sklearn.grid_search import GridSearchCV
from xgboost import plot_importance
from lightgbm import plot_importance as lgb_plot_importance
%matplotlib inline
In [2]:
# Load the Adult (census income) dataset. The raw file ships without a header
# row, so supply the column names directly at read time instead of assigning
# data.columns afterwards — the resulting frame is identical.
col_names = ['age','workclass','fnlwgt','education','education-num','marital_Status',
             'occupation','relationship','race','sex','capital_gain','capital_loss',
             'hours_per_week','native_country','Income']
data = pd.read_csv('adult.csv', header=None, names=col_names)
data.head()
Out[2]:
In [3]:
# data preprocessing
## encode label
l=LabelEncoder()
l.fit(data.Income)
# NOTE: this mid-cell expression is never displayed; only a cell's last
# expression renders.
l.classes_
data.Income=Series(l.transform(data.Income))
data.Income.value_counts()
Out[3]:
In [4]:
## convert categorical data into one-hot, and drop original categorical data
one_hot_workclass=pd.get_dummies(data.workclass)
one_hot_education=pd.get_dummies(data.education)
one_hot_marital_Status=pd.get_dummies(data.marital_Status)
one_hot_occupation=pd.get_dummies(data.occupation)
one_hot_relationship=pd.get_dummies(data.relationship)
one_hot_race=pd.get_dummies(data.race)
one_hot_sex=pd.get_dummies(data.sex)
one_hot_native_country=pd.get_dummies(data.native_country)
data.drop(['workclass','education','marital_Status','occupation','relationship','race','sex','native_country'],axis=1,inplace=True)
In [5]:
data=pd.concat([data,one_hot_workclass,one_hot_education,one_hot_marital_Status,one_hot_occupation,one_hot_relationship,one_hot_race,one_hot_sex,one_hot_native_country],axis=1)
## remove duplicate columns
# np.unique returns the sorted unique column names plus the index of each
# name's first occurrence; selecting those indices keeps exactly one column per
# name (columns end up in sorted order).
i = np.unique(data.columns, return_index=True)
data=data.iloc[:, i[1]] # use the index of unique columns
data.head()
Out[5]:
In [6]:
features = data.drop('Income',axis=1)
label = data.Income
In [7]:
label.mode()[0]
# NOTE(review): Income was just label-encoded from a complete column, so there
# should be no NaNs left and this fillna is presumably a no-op — TODO confirm.
label.fillna(label.mode()[0],inplace=True) # impute missing data with mode
In [8]:
label.value_counts()
Out[8]:
In [9]:
features_train,features_test,label_train,label_test=train_test_split(features,label,test_size=.3)
In [95]:
# method 1 - xgboost, with cross validation
dtrain=xgb.DMatrix(features_train,label=label_train)
dtest=xgb.DMatrix(features_test)
## booster params
# FIX: the original dict set both 'eta':1 and 'learning_rate':.05 — these are
# aliases of the same parameter, so the effective learning rate was ambiguous
# (and eta=1 would be far too aggressive). Keep only the intended 0.05.
booster_params = {'max_depth':7, 'silent':1,'objective':'binary:logistic','eval_metric':'auc','learning_rate':.05}
In [49]:
num_boost_round = 50
nfold = 5
metrics = ('auc', 'logloss')
seed = 410
# FIX: xgb.cv's signature is cv(params, dtrain, num_boost_round, nfold,
# stratified, folds, metrics, ...). The original all-positional call passed
# `metrics` into `stratified` and `seed` into `folds`; pass them by keyword.
xgb_cv = xgb.cv(booster_params, dtrain, num_boost_round=num_boost_round,
                nfold=nfold, metrics=metrics, seed=seed)
print(xgb_cv.shape)
In [48]:
xgb_cv # it returns the optimum number of trees required (n_estimators)
Out[48]:
In [92]:
# CV method 1 - xgboost cv() method
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=10, early_stopping_rounds=50):
    """Tune n_estimators via xgb.cv, then fit `alg` and report train metrics.

    alg        -- an XGBClassifier (sklearn API); mutated in place via set_params
    dtrain     -- xgb.DMatrix used only for the cross-validation step
    predictors -- feature matrix used to fit/evaluate (callers pass features_train)
    Returns the fitted estimator.

    NOTE(review): labels still come from the module-level `label_train`. The
    original also ignored `predictors` and fit on the global `features_train`;
    this version uses `predictors` (callers already pass features_train, so
    behavior is unchanged) — TODO thread the labels through as a parameter too.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = dtrain
        # cv() with early stopping truncates the table; its row count is the
        # optimum number of boosting rounds.
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='auc', early_stopping_rounds=early_stopping_rounds, seed=410)
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data
    alg_fit = alg.fit(predictors, label_train, eval_metric='auc')
    # Predict training set:
    dtrain_predictions = alg.predict(predictors)
    dtrain_predprob = alg.predict_proba(predictors)[:, 1]
    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % accuracy_score(label_train, dtrain_predictions))
    print("AUC Score (Train): %f" % roc_auc_score(label_train, dtrain_predprob))
    plot_importance(alg_fit, ax=None, height=0.2, xlim=None, ylim=None, title='Feature importance', xlabel='F score', ylabel='Features', importance_type='weight', max_num_features=7, grid=True)
    return alg_fit
In [93]:
# Time the CV-tuned fit of a first XGBoost configuration.
start=datetime.now()
xgb1 = XGBClassifier(
learning_rate =0.1,
n_estimators=1000, # default value here, will be changed by cvresult
max_depth=5,
min_child_weight=1,
gamma=0,
subsample=0.8,
colsample_bytree=0.8,
objective= 'binary:logistic',
nthread=4,
scale_pos_weight=1,
seed=410)
xgb_model = modelfit(xgb1, dtrain, features_train)
stop=datetime.now()
# NOTE(review): the variable name says lgbm but this times the XGBoost fit.
execution_time_lgbm = stop-start
print(execution_time_lgbm) # 0:00:20.234687
In [99]:
# Predict test-set class labels with the CV-tuned model.
ypred = xgb_model.predict(features_test)
print(ypred)
In [102]:
accuracy_xgb = accuracy_score(label_test,ypred)
print("sklearn accuracy", accuracy_xgb)
# FIX: for labels {0,1} sklearn's confusion_matrix puts the negative class (0)
# first, so cm[0][0] counts true negatives — the original unpacked it as TP and
# therefore reported precision/recall/specificity for the wrong class.
# Accuracy and AUC are unaffected by the relabeling.
cm = confusion_matrix(label_test, ypred)
TN, FP, FN, TP = cm.ravel()
accuracy_xgb = (TP + TN)/(TP+FP+FN+TN)
auc_score_xgb = roc_auc_score(label_test, ypred)
precision_xgb = TP/(TP+FP)
specificity_xgb = TN/(TN+FP)
recall_xgb = TP/(TP+FN)
print("accuracy: ", accuracy_xgb)
print("AUC: ", auc_score_xgb)
print("Precision: ", precision_xgb)
print("Specificity: ", specificity_xgb)
print("Recall: ", recall_xgb)
In [109]:
# CV method 2 - GridSearchCV() method
## GridSearchCV params: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# Sequential tuning: each stage fixes the previous stage's best values and
# searches one or two new parameters. Stage 1: tree complexity.
param_set = {
'max_depth':[3,5,7,9],
'min_child_weight':[1,3,5] # smaller min_child_weight can handle smaller class in imbalanced dataset
}
gsearch = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=100, max_depth=5,
min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, nthread=7,
objective= 'binary:logistic', scale_pos_weight=1, seed=410),
param_grid = param_set, scoring='roc_auc',n_jobs=7,iid=False, cv=10)
In [110]:
xgb_model2 = gsearch.fit(features_train, label_train)
# NOTE(review): grid_scores_ and iid= exist only on the deprecated
# sklearn.grid_search version of GridSearchCV imported above; the
# model_selection version exposes cv_results_ instead — TODO confirm.
xgb_model2.grid_scores_, xgb_model2.best_params_, xgb_model2.best_score_
Out[110]:
In [111]:
# modify values with optimum values and tune other params
param_set = {
'gamma':[i/10.0 for i in range(0,5)]
}
gsearch = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=100, max_depth=7,
min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, nthread=7,
objective= 'binary:logistic', scale_pos_weight=1, seed=410),
param_grid = param_set, scoring='roc_auc',n_jobs=7,iid=False, cv=10)
xgb_model2 = gsearch.fit(features_train, label_train)
xgb_model2.grid_scores_, xgb_model2.best_params_, xgb_model2.best_score_
Out[111]:
In [112]:
# modify values with optimum values and tune other params
param_set = {
'subsample':[i/10.0 for i in range(6,10)],
'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=100, max_depth=7,
min_child_weight=1, gamma=0.3, subsample=0.8, colsample_bytree=0.8, nthread=7,
objective= 'binary:logistic', scale_pos_weight=1, seed=410),
param_grid = param_set, scoring='roc_auc',n_jobs=7,iid=False, cv=10)
xgb_model2 = gsearch.fit(features_train, label_train)
xgb_model2.grid_scores_, xgb_model2.best_params_, xgb_model2.best_score_
Out[112]:
In [113]:
# subsample and colsample_bytree are choosing edge values, check in more detail
param_set = {
'subsample':[0.8, 0.9, 1.0],
'colsample_bytree':[i/10.0 for i in range(1,7)]
}
gsearch = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=100, max_depth=7,
min_child_weight=1, gamma=0.3, subsample=0.8, colsample_bytree=0.8, nthread=7,
objective= 'binary:logistic', scale_pos_weight=1, seed=410),
param_grid = param_set, scoring='roc_auc',n_jobs=7,iid=False, cv=10)
xgb_model2 = gsearch.fit(features_train, label_train)
xgb_model2.grid_scores_, xgb_model2.best_params_, xgb_model2.best_score_
Out[113]:
In [119]:
# subsample and colsample_bytree are choosing edge
param_set = {
'subsample':[0.8, 0.9, 1.0],
'colsample_bytree':[i/100.0 for i in range(1,10,2)]
}
gsearch = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=100, max_depth=7,
min_child_weight=1, gamma=0.3, subsample=0.8, colsample_bytree=0.8, nthread=7,
objective= 'binary:logistic', scale_pos_weight=1, seed=410),
param_grid = param_set, scoring='roc_auc',n_jobs=7,iid=False, cv=10)
xgb_model2 = gsearch.fit(features_train, label_train)
xgb_model2.grid_scores_, xgb_model2.best_params_, xgb_model2.best_score_ # got the same accuracy in fact
Out[119]:
In [120]:
# Refit a single model with the tuned values and time it.
start=datetime.now()
xgb2 = XGBClassifier(
learning_rate =0.1,
n_estimators=100,
max_depth=7,
min_child_weight=1,
gamma=0.3,
subsample=1.0,
colsample_bytree=0.1,
objective= 'binary:logistic',
nthread=7,
scale_pos_weight=1,
seed=410)
xgb_model2 = modelfit(xgb2, dtrain, features_train)
stop=datetime.now()
execution_time_lgbm = stop-start
print(execution_time_lgbm) # 0:00:07.11318
ypred = xgb_model2.predict(features_test)
print(ypred)
In [121]:
# NOTE(review): ypred2 duplicates the prediction already stored in ypred above.
ypred2=xgb_model2.predict(features_test)
ypred2[0:5]
Out[121]:
In [122]:
# FIX: for labels {0,1} sklearn's confusion_matrix puts the negative class (0)
# first, so cm[0][0] counts true negatives — the original unpacked it as TP and
# reported precision/recall/specificity for the wrong class.
cm = confusion_matrix(label_test, ypred2)
TN, FP, FN, TP = cm.ravel()
accuracy_xgb = (TP + TN)/(TP+FP+FN+TN)
# FIX: score this cell's own predictions (ypred2), not the shared global
# `ypred`, which only matches by accident of execution order.
auc_score_xgb = roc_auc_score(label_test, ypred2)
precision_xgb = TP/(TP+FP)
specificity_xgb = TN/(TN+FP)
recall_xgb = TP/(TP+FN)
print("accuracy: ", accuracy_xgb)
print("AUC: ", auc_score_xgb)
print("Precision: ", precision_xgb)
print("Specificity: ", specificity_xgb)
print("Recall: ", recall_xgb)
In [132]:
# modify values with optimum values and tune other params
# Stage: L1 regularization strength, coarse grid first, then refined below.
param_set = {
'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=100, max_depth=7,
min_child_weight=1, gamma=0.3, subsample=1.0, colsample_bytree=0.1, nthread=7,
objective= 'binary:logistic', scale_pos_weight=1, seed=410),
param_grid = param_set, scoring='roc_auc',n_jobs=7,iid=False, cv=10)
xgb_model3 = gsearch.fit(features_train, label_train)
xgb_model3.grid_scores_, xgb_model3.best_params_, xgb_model3.best_score_
Out[132]:
In [133]:
# further tune reg_alpha value
param_set = {
'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]
}
gsearch = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=100, max_depth=7,
min_child_weight=1, gamma=0.3, subsample=1.0, colsample_bytree=0.1, nthread=7,
objective= 'binary:logistic', scale_pos_weight=1, seed=410),
param_grid = param_set, scoring='roc_auc',n_jobs=7,iid=False, cv=10)
xgb_model3 = gsearch.fit(features_train, label_train)
xgb_model3.grid_scores_, xgb_model3.best_params_, xgb_model3.best_score_
Out[133]:
In [130]:
# Refit with the tuned reg_alpha and time it.
start=datetime.now()
xgb3 = XGBClassifier(
learning_rate =0.1,
n_estimators=100,
max_depth=7,
min_child_weight=1,
gamma=0.3,
subsample=1.0,
colsample_bytree=0.1,
objective= 'binary:logistic',
nthread=7,
scale_pos_weight=1,
reg_alpha = 0,
seed=410)
xgb_model3 = modelfit(xgb3, dtrain, features_train)
stop=datetime.now()
execution_time_lgbm = stop-start
print(execution_time_lgbm) # 0:00:07.100202
ypred = xgb_model3.predict(features_test)
print(ypred)
In [131]:
ypred3=xgb_model3.predict(features_test)
# FIX: for labels {0,1} sklearn's confusion_matrix puts the negative class (0)
# first, so cm[0][0] counts true negatives — the original unpacked it as TP and
# reported precision/recall/specificity for the wrong class.
cm = confusion_matrix(label_test, ypred3)
TN, FP, FN, TP = cm.ravel()
accuracy_xgb = (TP + TN)/(TP+FP+FN+TN)
# FIX: score this cell's own predictions (ypred3), not the shared global `ypred`.
auc_score_xgb = roc_auc_score(label_test, ypred3)
precision_xgb = TP/(TP+FP)
specificity_xgb = TN/(TN+FP)
recall_xgb = TP/(TP+FN)
print("accuracy: ", accuracy_xgb)
print("AUC: ", auc_score_xgb)
print("Precision: ", precision_xgb)
print("Specificity: ", specificity_xgb)
print("Recall: ", recall_xgb)
In [134]:
# tune scale_pos_weight
## A value greater than 0 should be used in case of high class imbalance as it helps in faster convergence.
# NOTE(review): despite the comment above, this grid starts at 0.0
# (range(11) begins at 0) — scale_pos_weight=0 zeroes out the positive-class
# weight; presumably unintended — TODO confirm.
param_set = {
'scale_pos_weight':[i/10.0 for i in range(11)]
}
gsearch = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=100, max_depth=7,
min_child_weight=1, gamma=0, subsample=1.0, colsample_bytree=0.1, nthread=7,
objective= 'binary:logistic', scale_pos_weight=1, seed=410),
param_grid = param_set, scoring='roc_auc',n_jobs=7,iid=False, cv=10)
xgb_model4 = gsearch.fit(features_train, label_train)
xgb_model4.grid_scores_, xgb_model4.best_params_, xgb_model4.best_score_
Out[134]:
In [135]:
# tune learning rate
param_set = {
'learning_rate':[0.01, 0.05, 0.1]
}
gsearch = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=100, max_depth=7,
min_child_weight=1, gamma=0, subsample=1.0, colsample_bytree=0.1, nthread=7,
objective= 'binary:logistic', scale_pos_weight=1, seed=410),
param_grid = param_set, scoring='roc_auc',n_jobs=7,iid=False, cv=10)
xgb_model4 = gsearch.fit(features_train, label_train)
xgb_model4.grid_scores_, xgb_model4.best_params_, xgb_model4.best_score_
Out[135]:
In [136]:
# tune learning rate
param_set = {
'learning_rate':[i/10.0 for i in range(1,10)]
}
gsearch = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=100, max_depth=7,
min_child_weight=1, gamma=0, subsample=1.0, colsample_bytree=0.1, nthread=7,
objective= 'binary:logistic', scale_pos_weight=1, seed=410),
param_grid = param_set, scoring='roc_auc',n_jobs=7,iid=False, cv=10)
xgb_model4 = gsearch.fit(features_train, label_train)
xgb_model4.grid_scores_, xgb_model4.best_params_, xgb_model4.best_score_
Out[136]:
In [137]:
# tune n_estimators
param_set = {
'n_estimators':[50, 100, 500, 1000]
}
gsearch = GridSearchCV(estimator = XGBClassifier(learning_rate =0.2, n_estimators=100, max_depth=7,
min_child_weight=1, gamma=0, subsample=1.0, colsample_bytree=0.1, nthread=7,
objective= 'binary:logistic', scale_pos_weight=1, seed=410),
param_grid = param_set, scoring='roc_auc',n_jobs=7,iid=False, cv=10)
xgb_model4 = gsearch.fit(features_train, label_train)
xgb_model4.grid_scores_, xgb_model4.best_params_, xgb_model4.best_score_
Out[137]:
In [138]:
# Refit with the tuned learning rate and time it.
start=datetime.now()
xgb4 = XGBClassifier(
learning_rate =0.2,
n_estimators=100,
max_depth=7,
min_child_weight=1,
gamma=0,
subsample=1.0,
colsample_bytree=0.1,
objective= 'binary:logistic',
nthread=7,
scale_pos_weight=1,
reg_alpha = 0,
seed=410)
xgb_model4 = modelfit(xgb4, dtrain, features_train)
stop=datetime.now()
execution_time_lgbm = stop-start
print(execution_time_lgbm) # 0:00:07.033940
ypred = xgb_model4.predict(features_test)
print(ypred)
In [139]:
ypred4=xgb_model4.predict(features_test)
# FIX: for labels {0,1} sklearn's confusion_matrix puts the negative class (0)
# first, so cm[0][0] counts true negatives — the original unpacked it as TP and
# reported precision/recall/specificity for the wrong class.
cm = confusion_matrix(label_test, ypred4)
TN, FP, FN, TP = cm.ravel()
accuracy_xgb = (TP + TN)/(TP+FP+FN+TN)
# FIX: score this cell's own predictions (ypred4), not the shared global `ypred`.
auc_score_xgb = roc_auc_score(label_test, ypred4)
precision_xgb = TP/(TP+FP)
specificity_xgb = TN/(TN+FP)
recall_xgb = TP/(TP+FN)
print("accuracy: ", accuracy_xgb)
print("AUC: ", auc_score_xgb)
print("Precision: ", precision_xgb)
print("Specificity: ", specificity_xgb)
print("Recall: ", recall_xgb)
# both precision and specificity dropped, while accuracy, AUC, recall increased
In [146]:
# LightGBM Cross Validation method 1 - cv()
# NOTE: rebinds `dtrain` (previously an xgb.DMatrix) to a lgb.Dataset.
dtrain = lgb.Dataset(features_train,label=label_train)
booster_params = {'num_leaves':150, 'objective':'binary','max_depth':7,'learning_rate':.05,'max_bin':200,'metric':['auc', 'binary_logloss']}
# NOTE(review): metrics are given both in booster_params['metric'] and in the
# metrics= argument (redundant), and early_stopping_rounds equals
# num_boost_round (50), so early stopping can never trigger here.
lgb_cv = lgb.cv(booster_params, dtrain, num_boost_round=50, folds=None, nfold=10, stratified=False, shuffle=True, metrics=['auc', 'binary_logloss'], fobj=None, feval=None, init_model=None, feature_name='auto', categorical_feature='auto', early_stopping_rounds=50, fpreproc=None, verbose_eval=None, show_stdv=True, seed=410, callbacks=None)
In [147]:
# Inspect the per-metric CV histories; each list holds one mean score per
# boosting round.
for metric_name, history in lgb_cv.items():
    print(metric_name, history, len(history))
# The scores keep improving through round 50, so let's choose 50 as n_estimators.
In [174]:
n_estimators = 50
start=datetime.now()
lgb1 = LGBMClassifier(
boosting_type='gbdt',
num_leaves=30,
max_depth=5,
learning_rate=0.1, n_estimators=n_estimators, max_bin=255,
# NOTE(review): subsample_for_bin is documented as an integer sample count;
# 0.8 looks like a fraction pasted from another parameter — later cells use
# 5000 — TODO confirm.
subsample_for_bin=0.8, objective=None, min_split_gain=0,
min_child_weight=5, min_child_samples=10, subsample=1,
subsample_freq=1, colsample_bytree=1, reg_alpha=1, reg_lambda=0, seed=410, nthread=7, silent=True)
#Fit the algorithm on the data
lgb_fit = lgb1.fit(features_train, label_train, eval_metric='auc')
#Predict training set:
dtrain_predictions = lgb1.predict(features_train)
dtrain_predprob = lgb1.predict_proba(features_train)[:,1]
#Print model report:
print("\nModel Report")
print("Accuracy : %.4g" % accuracy_score(label_train, dtrain_predictions))
print("AUC Score (Train): %f" % roc_auc_score(label_train, dtrain_predprob))
lgb_plot_importance(lgb_fit, ax=None, height=0.2, xlim=None, ylim=None, title='Feature importance', xlabel='F score', ylabel='Features', importance_type='split', max_num_features=7, grid=True,)
stop=datetime.now()
execution_time_lgbm = stop-start
print(execution_time_lgbm) # 0:00:00.381134, much faster. Lower accuracy, maybe because of the param tuning is limited with cv()
In [10]:
def lgb_model_fit(alg, features_train, label_train):
    """Fit a LightGBM sklearn-API classifier, print its train-set accuracy and
    AUC, plot the top feature importances, and return the fitted estimator."""
    # Fit the algorithm on the data
    fitted = alg.fit(features_train, label_train, eval_metric='auc')
    # Predict training set:
    train_pred = alg.predict(features_train)
    train_prob = alg.predict_proba(features_train)[:, 1]
    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % accuracy_score(label_train, train_pred))
    print("AUC Score (Train): %f" % roc_auc_score(label_train, train_prob))
    lgb_plot_importance(fitted, ax=None, height=0.2, xlim=None, ylim=None,
                        title='Feature importance', xlabel='F score',
                        ylabel='Features', importance_type='split',
                        max_num_features=7, grid=True)
    return fitted
In [183]:
# LightGBM cross validation method 2 - GridSearchCV()
# the code here will be running forever
# Stage 1: number of trees.
param_set = {
'n_estimators':[20, 30, 40, 50, 60, 70]
}
gsearch = GridSearchCV(estimator = LGBMClassifier( boosting_type='gbdt', num_leaves=30,
max_depth=5, learning_rate=0.1, n_estimators=50, max_bin=225, # choose a small values for bins to speed up
subsample_for_bin=5000, objective=None, min_split_gain=0, min_child_weight=5,
min_child_samples=10, subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7, silent=True),
param_grid = param_set, scoring='roc_auc',n_jobs=1,iid=False, cv=10)
lgb_model2 = gsearch.fit(features_train, label_train)
lgb_model2.grid_scores_, lgb_model2.best_params_, lgb_model2.best_score_
Out[183]:
In [182]:
param_set = {
'n_estimators':[70, 100, 500, 1000]
}
gsearch = GridSearchCV(estimator = LGBMClassifier( boosting_type='gbdt', num_leaves=30,
max_depth=5, learning_rate=0.1, n_estimators=50, max_bin=225, # choose a small values for bins to speed up
subsample_for_bin=5000, objective=None, min_split_gain=0, min_child_weight=5,
min_child_samples=10, subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7, silent=True),
param_grid = param_set, scoring='roc_auc',n_jobs=1,iid=False, cv=10)
lgb_model2 = gsearch.fit(features_train, label_train)
lgb_model2.grid_scores_, lgb_model2.best_params_, lgb_model2.best_score_
Out[182]:
In [185]:
# Stage 2: tree depth, coarse then fine.
param_set = {
'max_depth':[5,7,9,11,13,15]
}
gsearch = GridSearchCV(estimator = LGBMClassifier( boosting_type='gbdt', num_leaves=30,
max_depth=5, learning_rate=0.1, n_estimators=100, max_bin=225,
subsample_for_bin=5000, objective=None, min_split_gain=0, min_child_weight=5,
min_child_samples=10, subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7, silent=True),
param_grid = param_set, scoring='roc_auc',n_jobs=1,iid=False, cv=10)
lgb_model2 = gsearch.fit(features_train, label_train)
lgb_model2.grid_scores_, lgb_model2.best_params_, lgb_model2.best_score_
Out[185]:
In [186]:
param_set = {
'max_depth':[1,2,3,4,5]
}
gsearch = GridSearchCV(estimator = LGBMClassifier( boosting_type='gbdt', num_leaves=30,
max_depth=5, learning_rate=0.1, n_estimators=100, max_bin=225,
subsample_for_bin=5000, objective=None, min_split_gain=0, min_child_weight=5,
min_child_samples=10, subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7, silent=True),
param_grid = param_set, scoring='roc_auc',n_jobs=1,iid=False, cv=10)
lgb_model2 = gsearch.fit(features_train, label_train)
lgb_model2.grid_scores_, lgb_model2.best_params_, lgb_model2.best_score_
Out[186]:
In [187]:
param_set = {
'num_leaves':[20, 25, 30, 35] # this value often starts with 2^max_depth
}
gsearch = GridSearchCV(estimator = LGBMClassifier( boosting_type='gbdt', num_leaves=30,
max_depth=5, learning_rate=0.1, n_estimators=100, max_bin=225,
subsample_for_bin=5000, objective=None, min_split_gain=0, min_child_weight=5,
min_child_samples=10, subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7, silent=True),
param_grid = param_set, scoring='roc_auc',n_jobs=1,iid=False, cv=10)
lgb_model2 = gsearch.fit(features_train, label_train)
lgb_model2.grid_scores_, lgb_model2.best_params_, lgb_model2.best_score_
Out[187]:
In [188]:
# FIX: the original grid was [10, 50, 100, 100]; the duplicated 100 re-ran an
# identical 10-fold cross-validated fit for no benefit.
param_set = {
'min_child_samples':[10, 50, 100] # this depends on num_leaves the number of data records.
# large value to avoid overfitting, but may lead to underfitting
}
gsearch = GridSearchCV(estimator = LGBMClassifier( boosting_type='gbdt', num_leaves=30,
max_depth=5, learning_rate=0.1, n_estimators=100, max_bin=225,
subsample_for_bin=5000, objective=None, min_split_gain=0, min_child_weight=5,
min_child_samples=10, subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7, silent=True),
param_grid = param_set, scoring='roc_auc',n_jobs=1,iid=False, cv=10)
lgb_model2 = gsearch.fit(features_train, label_train)
lgb_model2.grid_scores_, lgb_model2.best_params_, lgb_model2.best_score_
Out[188]:
In [189]:
# Refine min_child_samples below the coarse optimum.
param_set = {
'min_child_samples':[1, 3, 5, 7, 9, 10] # this depends on num_leaves the number of data records.
# large value to avoid overfitting, but may lead to underfitting
}
gsearch = GridSearchCV(estimator = LGBMClassifier( boosting_type='gbdt', num_leaves=30,
max_depth=5, learning_rate=0.1, n_estimators=100, max_bin=225,
subsample_for_bin=5000, objective=None, min_split_gain=0, min_child_weight=5,
min_child_samples=10, subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7, silent=True),
param_grid = param_set, scoring='roc_auc',n_jobs=1,iid=False, cv=10)
lgb_model2 = gsearch.fit(features_train, label_train)
lgb_model2.grid_scores_, lgb_model2.best_params_, lgb_model2.best_score_
# in fact here, mean, std didn't change
Out[189]:
In [190]:
param_set = {
'min_child_samples':[10, 11, 12, 13, 14, 15] # this depends on num_leaves the number of data records.
# large value to avoid overfitting, but may lead to underfitting
}
gsearch = GridSearchCV(estimator = LGBMClassifier(boosting_type='gbdt', num_leaves=30,
max_depth=5, learning_rate=0.1, n_estimators=100, max_bin=225,
subsample_for_bin=5000, objective=None, min_split_gain=0, min_child_weight=5,
min_child_samples=10, subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7, silent=True),
param_grid = param_set, scoring='roc_auc',n_jobs=1,iid=False, cv=10)
lgb_model2 = gsearch.fit(features_train, label_train)
lgb_model2.grid_scores_, lgb_model2.best_params_, lgb_model2.best_score_
Out[190]:
In [193]:
# Refit a single LightGBM model with the tuned values and time it.
start=datetime.now()
lgb2 = LGBMClassifier(
boosting_type='gbdt',
num_leaves=30,
max_depth=5,
learning_rate=0.1,
n_estimators=100,
max_bin=225,
subsample_for_bin=5000,
min_split_gain=0,
min_child_weight=5,
min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7
)
lgb_model2 = lgb_model_fit(lgb2, features_train, label_train)
stop=datetime.now()
execution_time_lgbm = stop-start
print(execution_time_lgbm) # 0:00:00.635665
ypred = lgb_model2.predict(features_test)
print(ypred)
In [195]:
ypred2=lgb_model2.predict(features_test)
# FIX: for labels {0,1} sklearn's confusion_matrix puts the negative class (0)
# first, so cm[0][0] counts true negatives — the original unpacked it as TP and
# reported precision/recall/specificity for the wrong class.
cm = confusion_matrix(label_test, ypred2)
TN, FP, FN, TP = cm.ravel()
accuracy_lgb = (TP + TN)/(TP+FP+FN+TN)
# FIX: score this cell's own predictions (ypred2), not the shared global `ypred`.
auc_score_lgb = roc_auc_score(label_test, ypred2)
precision_lgb = TP/(TP+FP)
specificity_lgb = TN/(TN+FP)
recall_lgb = TP/(TP+FN)
print("accuracy: ", accuracy_lgb)
print("AUC: ", auc_score_lgb)
print("Precision: ", precision_lgb)
print("Specificity: ", specificity_lgb)
print("Recall: ", recall_lgb)
In [196]:
# Stage: histogram binning — number of bins and samples used to build them.
param_set = {
'max_bin':[50, 100, 200, 225, 300, 500, 1000],
'subsample_for_bin':[100, 500, 1000, 5000, 10000]
}
gsearch = GridSearchCV(estimator = LGBMClassifier(boosting_type='gbdt', num_leaves=30,
max_depth=5, learning_rate=0.1, n_estimators=100, max_bin=225,
subsample_for_bin=5000, min_split_gain=0, min_child_weight=5,
min_child_samples=10, subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7, silent=True),
param_grid = param_set, scoring='roc_auc',n_jobs=1,iid=False, cv=10)
lgb_model3 = gsearch.fit(features_train, label_train)
lgb_model3.grid_scores_, lgb_model3.best_params_, lgb_model3.best_score_
Out[196]:
In [197]:
param_set = {
'max_bin':[10, 20, 30, 40, 50],
'subsample_for_bin':[20, 40, 60, 80, 100]
}
gsearch = GridSearchCV(estimator = LGBMClassifier(boosting_type='gbdt', num_leaves=30,
max_depth=5, learning_rate=0.1, n_estimators=100, max_bin=225,
subsample_for_bin=5000, min_split_gain=0, min_child_weight=5,
min_child_samples=10, subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7, silent=True),
param_grid = param_set, scoring='roc_auc',n_jobs=1,iid=False, cv=10)
lgb_model3 = gsearch.fit(features_train, label_train)
lgb_model3.grid_scores_, lgb_model3.best_params_, lgb_model3.best_score_
Out[197]:
In [199]:
# Refit with the tuned binning parameters and time it.
start=datetime.now()
lgb3 = LGBMClassifier(
boosting_type='gbdt',
num_leaves=30,
max_depth=5,
learning_rate=0.1,
n_estimators=100,
max_bin=50,
subsample_for_bin=20,
min_split_gain=0,
min_child_weight=5,
min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7
)
lgb_model3 = lgb_model_fit(lgb3, features_train, label_train)
stop=datetime.now()
execution_time_lgbm = stop-start
print(execution_time_lgbm) # 0:00:00.622074
ypred = lgb_model3.predict(features_test)
print(ypred)
In [201]:
ypred3=lgb_model3.predict(features_test)
# FIX: for labels {0,1} sklearn's confusion_matrix puts the negative class (0)
# first, so cm[0][0] counts true negatives — the original unpacked it as TP and
# reported precision/recall/specificity for the wrong class.
cm = confusion_matrix(label_test, ypred3)
TN, FP, FN, TP = cm.ravel()
accuracy_lgb = (TP + TN)/(TP+FP+FN+TN)
# FIX: score this cell's own predictions (ypred3), not the shared global `ypred`.
auc_score_lgb = roc_auc_score(label_test, ypred3)
precision_lgb = TP/(TP+FP)
specificity_lgb = TN/(TN+FP)
recall_lgb = TP/(TP+FN)
print("accuracy: ", accuracy_lgb)
print("AUC: ", auc_score_lgb)
print("Precision: ", precision_lgb)
print("Specificity: ", specificity_lgb)
print("Recall: ", recall_lgb)
# the results here didn't really improve, maybe because of overfitting
In [14]:
# Re-fit with subsample_for_bin backed off to 5000 (the tiny value of 20 above
# likely caused overfitting of the bin boundaries).
start=datetime.now()
lgb3 = LGBMClassifier(
boosting_type='gbdt',
num_leaves=30,
max_depth=5,
learning_rate=0.1,
n_estimators=100,
max_bin=50,
subsample_for_bin=5000,
min_split_gain=0,
min_child_weight=5,
min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7
)
lgb_model3 = lgb_model_fit(lgb3, features_train, label_train)
stop=datetime.now()
execution_time_lgbm = stop-start
print(execution_time_lgbm) # 0:00:00.622074
ypred = lgb_model3.predict(features_test)
print(ypred)
In [15]:
ypred3=lgb_model3.predict(features_test)
# FIX: for labels {0,1} sklearn's confusion_matrix puts the negative class (0)
# first, so cm[0][0] counts true negatives — the original unpacked it as TP and
# reported precision/recall/specificity for the wrong class.
cm = confusion_matrix(label_test, ypred3)
TN, FP, FN, TP = cm.ravel()
accuracy_lgb = (TP + TN)/(TP+FP+FN+TN)
# FIX: score this cell's own predictions (ypred3), not the shared global `ypred`.
auc_score_lgb = roc_auc_score(label_test, ypred3)
precision_lgb = TP/(TP+FP)
specificity_lgb = TN/(TN+FP)
recall_lgb = TP/(TP+FN)
print("accuracy: ", accuracy_lgb)
print("AUC: ", auc_score_lgb)
print("Precision: ", precision_lgb)
print("Specificity: ", specificity_lgb)
print("Recall: ", recall_lgb)
In [16]:
# Stage: row/column sampling and min_child_weight.
param_set = {
'min_child_weight':[3, 5, 7, 9],
'colsample_bytree':[0.2, 0.4, 0.6, 0.8, 1],
'subsample':[0.2, 0.4, 0.6, 0.8, 1]
}
gsearch = GridSearchCV(estimator = LGBMClassifier(boosting_type='gbdt', num_leaves=30,
max_depth=5, learning_rate=0.1, n_estimators=100, max_bin=50,
subsample_for_bin=5000, min_split_gain=0, min_child_weight=5,
min_child_samples=10, subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7),
param_grid = param_set, scoring='roc_auc',n_jobs=1,iid=False, cv=10)
lgb_model4 = gsearch.fit(features_train, label_train)
lgb_model4.grid_scores_, lgb_model4.best_params_, lgb_model4.best_score_
Out[16]:
In [19]:
param_set = {
'min_child_weight':[1,2,3,4]
}
gsearch = GridSearchCV(estimator = LGBMClassifier(boosting_type='gbdt', num_leaves=30,
max_depth=5, learning_rate=0.1, n_estimators=100, max_bin=50,
subsample_for_bin=5000, min_split_gain=0, min_child_weight=5,
min_child_samples=10, subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7),
param_grid = param_set, scoring='roc_auc',n_jobs=1,iid=False, cv=10)
lgb_model4 = gsearch.fit(features_train, label_train)
lgb_model4.grid_scores_, lgb_model4.best_params_, lgb_model4.best_score_
Out[19]:
In [22]:
# Refit with the tuned sampling parameters and time it.
start=datetime.now()
lgb4 = LGBMClassifier(
boosting_type='gbdt',
num_leaves=30,
max_depth=5,
learning_rate=0.1,
n_estimators=100,
max_bin=50,
subsample_for_bin=5000,
min_split_gain=0,
min_child_weight=3,
min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=0.6,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7
)
lgb_model4 = lgb_model_fit(lgb4, features_train, label_train)
stop=datetime.now()
execution_time_lgbm = stop-start
print(execution_time_lgbm) # 0:00:00.622074
ypred = lgb_model4.predict(features_test)
print(ypred)
In [23]:
ypred4=lgb_model4.predict(features_test)
# FIX: for labels {0,1} sklearn's confusion_matrix puts the negative class (0)
# first, so cm[0][0] counts true negatives — the original unpacked it as TP and
# reported precision/recall/specificity for the wrong class.
cm = confusion_matrix(label_test, ypred4)
TN, FP, FN, TP = cm.ravel()
accuracy_lgb = (TP + TN)/(TP+FP+FN+TN)
# FIX: score this cell's own predictions (ypred4), not the shared global `ypred`.
auc_score_lgb = roc_auc_score(label_test, ypred4)
precision_lgb = TP/(TP+FP)
specificity_lgb = TN/(TN+FP)
recall_lgb = TP/(TP+FN)
print("accuracy: ", accuracy_lgb)
print("AUC: ", auc_score_lgb)
print("Precision: ", precision_lgb)
print("Specificity: ", specificity_lgb)
print("Recall: ", recall_lgb)
In [24]:
# Try min_child_weight=2 (one step below the grid optimum of 3).
start=datetime.now()
lgb4 = LGBMClassifier(
boosting_type='gbdt',
num_leaves=30,
max_depth=5,
learning_rate=0.1,
n_estimators=100,
max_bin=50,
subsample_for_bin=5000,
min_split_gain=0,
min_child_weight=2,
min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=0.6,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7
)
lgb_model4 = lgb_model_fit(lgb4, features_train, label_train)
stop=datetime.now()
execution_time_lgbm = stop-start
print(execution_time_lgbm) # 0:00:00.886732
ypred = lgb_model4.predict(features_test)
print(ypred)
In [25]:
ypred4=lgb_model4.predict(features_test)
# FIX: for labels {0,1} sklearn's confusion_matrix puts the negative class (0)
# first, so cm[0][0] counts true negatives — the original unpacked it as TP and
# reported precision/recall/specificity for the wrong class.
cm = confusion_matrix(label_test, ypred4)
TN, FP, FN, TP = cm.ravel()
accuracy_lgb = (TP + TN)/(TP+FP+FN+TN)
# FIX: score this cell's own predictions (ypred4), not the shared global `ypred`.
auc_score_lgb = roc_auc_score(label_test, ypred4)
precision_lgb = TP/(TP+FP)
specificity_lgb = TN/(TN+FP)
recall_lgb = TP/(TP+FN)
print("accuracy: ", accuracy_lgb)
print("AUC: ", auc_score_lgb)
print("Precision: ", precision_lgb)
print("Specificity: ", specificity_lgb)
print("Recall: ", recall_lgb)
# overfit
In [26]:
# Stage: learning rate, coarse then extended upward.
param_set = {
'learning_rate':[0.01, 0.05, 0.1, 0.2]
}
gsearch = GridSearchCV(estimator = LGBMClassifier(boosting_type='gbdt', num_leaves=30,
max_depth=5, learning_rate=0.1, n_estimators=100, max_bin=50,
subsample_for_bin=5000, min_split_gain=0, min_child_weight=3,
min_child_samples=10, subsample=1, subsample_freq=1, colsample_bytree=0.6,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7),
param_grid = param_set, scoring='roc_auc',n_jobs=1,iid=False, cv=10)
lgb_model5 = gsearch.fit(features_train, label_train)
lgb_model5.grid_scores_, lgb_model5.best_params_, lgb_model5.best_score_
Out[26]:
In [27]:
param_set = {
'learning_rate':[0.2, 0.3, 0.4, 0.5]
}
gsearch = GridSearchCV(estimator = LGBMClassifier(boosting_type='gbdt', num_leaves=30,
max_depth=5, learning_rate=0.1, n_estimators=100, max_bin=50,
subsample_for_bin=5000, min_split_gain=0, min_child_weight=3,
min_child_samples=10, subsample=1, subsample_freq=1, colsample_bytree=0.6,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7),
param_grid = param_set, scoring='roc_auc',n_jobs=1,iid=False, cv=10)
lgb_model5 = gsearch.fit(features_train, label_train)
lgb_model5.grid_scores_, lgb_model5.best_params_, lgb_model5.best_score_
Out[27]:
In [28]:
# Refit with the tuned learning rate and time it.
start=datetime.now()
lgb5 = LGBMClassifier(
boosting_type='gbdt',
num_leaves=30,
max_depth=5,
learning_rate=0.2,
n_estimators=100,
max_bin=50,
subsample_for_bin=5000,
min_split_gain=0,
min_child_weight=3,
min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=0.6,
reg_alpha=1, reg_lambda=0, seed=410, nthread=7
)
lgb_model5 = lgb_model_fit(lgb5, features_train, label_train)
stop=datetime.now()
execution_time_lgbm = stop-start
print(execution_time_lgbm) # 0:00:00.843361
ypred = lgb_model5.predict(features_test)
print(ypred)
In [29]:
ypred5=lgb_model5.predict(features_test)
# FIX: for labels {0,1} sklearn's confusion_matrix puts the negative class (0)
# first, so cm[0][0] counts true negatives — the original unpacked it as TP and
# reported precision/recall/specificity for the wrong class.
cm = confusion_matrix(label_test, ypred5)
TN, FP, FN, TP = cm.ravel()
accuracy_lgb = (TP + TN)/(TP+FP+FN+TN)
# FIX: score this cell's own predictions (ypred5), not the shared global `ypred`.
auc_score_lgb = roc_auc_score(label_test, ypred5)
precision_lgb = TP/(TP+FP)
specificity_lgb = TN/(TN+FP)
recall_lgb = TP/(TP+FN)
print("accuracy: ", accuracy_lgb)
print("AUC: ", auc_score_lgb)
print("Precision: ", precision_lgb)
print("Specificity: ", specificity_lgb)
print("Recall: ", recall_lgb)
In [30]:
# try L2 regularization
# Same configuration as lgb5 but with reg_alpha (L1) swapped for reg_lambda (L2).
start=datetime.now()
lgb6 = LGBMClassifier(
boosting_type='gbdt',
num_leaves=30,
max_depth=5,
learning_rate=0.2,
n_estimators=100,
max_bin=50,
subsample_for_bin=5000,
min_split_gain=0,
min_child_weight=3,
min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=0.6,
reg_alpha=0, reg_lambda=1, seed=410, nthread=7
)
lgb_model6 = lgb_model_fit(lgb6, features_train, label_train)
stop=datetime.now()
execution_time_lgbm = stop-start
print(execution_time_lgbm) # 0:00:00.764053
ypred = lgb_model6.predict(features_test)
print(ypred)
In [31]:
ypred6=lgb_model6.predict(features_test)
# FIX: for labels {0,1} sklearn's confusion_matrix puts the negative class (0)
# first, so cm[0][0] counts true negatives — the original unpacked it as TP and
# reported precision/recall/specificity for the wrong class.
cm = confusion_matrix(label_test, ypred6)
TN, FP, FN, TP = cm.ravel()
accuracy_lgb = (TP + TN)/(TP+FP+FN+TN)
# FIX: score this cell's own predictions (ypred6), not the shared global `ypred`.
auc_score_lgb = roc_auc_score(label_test, ypred6)
precision_lgb = TP/(TP+FP)
specificity_lgb = TN/(TN+FP)
recall_lgb = TP/(TP+FN)
print("accuracy: ", accuracy_lgb)
print("AUC: ", auc_score_lgb)
print("Precision: ", precision_lgb)
print("Specificity: ", specificity_lgb)
print("Recall: ", recall_lgb)
# L1 works better in this case