In [ ]:
## https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
In [3]:
## Import libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier ## GBM algorithm
from sklearn import metrics
## sklearn.cross_validation and sklearn.grid_search were removed in modern scikit-learn;
## use sklearn.model_selection instead
from sklearn.model_selection import cross_val_score, GridSearchCV
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4
train = pd.read_csv('train_modified.csv')
target = 'Disbursed'
IDcol ='ID'
train['Disbursed'] = train.Disbursed.astype(int)
In [4]:
### Before proceeding further, let's define a function which will help us create GBM models and perform cross-validation.
def modelfit(alg, dtrain, dtest, predictors, performCV=True, printFeatureImportance=True, cv_folds=5):
    # Fit the algorithm on the training data
    # (note: dtest is accepted for interface symmetry but is not scored inside this function)
    alg.fit(dtrain[predictors], dtrain['Disbursed'])

    # Predict on the training set
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Perform cross-validation
    if performCV:
        cv_score = cross_val_score(alg, dtrain[predictors], dtrain['Disbursed'],
                                   cv=cv_folds, scoring='roc_auc')

    # Print model report
    print("\nModel Report")
    print("Accuracy : {:.4g}".format(metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions)))
    print("AUC Score (Train): {:.4f}".format(metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob)))
    if performCV:
        print("CV Score : Mean - {:.7g} | Std - {:.7g} | Min - {:.7g} | Max - {:.7g}".format(
            np.mean(cv_score), np.std(cv_score), np.min(cv_score), np.max(cv_score)))

    # Plot feature importances
    if printFeatureImportance:
        feat_imp = pd.Series(alg.feature_importances_, predictors).sort_values(ascending=False)
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')
In [9]:
test = pd.read_csv('test_modified.csv')
##Choose all predictors except target & IDcols
predictors = [ x for x in train.columns if x not in [target,IDcol]]
gbm0 = GradientBoostingClassifier(random_state=10)
modelfit(gbm0, train, test, predictors)
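In [ ]:
## modelfit receives dtest but does not score it. A minimal sketch of test-set
## scoring, assuming test_modified.csv also contains the 'Disbursed' target
## (if it is an unlabeled test file, keep only the prediction line):
test_predprob = gbm0.predict_proba(test[predictors])[:, 1]
# print("AUC Score (Test): {:.4f}".format(metrics.roc_auc_score(test['Disbursed'], test_predprob)))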
In [ ]:
## Check the class balance of the target; Disbursed is a heavily imbalanced class
train['Disbursed'].value_counts()
GBM Models:
There are 2 types of parameters here:
Tree-specific parameters:
min_samples_split
min_samples_leaf
max_depth
max_leaf_nodes
max_features
loss function
Boosting-specific parameters:
n_estimators
learning_rate
subsample
Approach for tackling the problem:
1. Decide a relatively high value for the learning rate and tune the number of estimators required for it.
2. Tune the tree-specific parameters for that learning rate.
3. Tune subsample.
4. Lower the learning rate as far as is computationally feasible and increase the number of estimators accordingly.
Step 1 - Find the number of estimators for a high learning rate
We will use the following benchmarks for the parameters:
min_samples_split = 500 : ~0.5-1% of total values. Since this is an imbalanced class problem, we'll take a small value.
min_samples_leaf = 50 : used just to prevent overfitting; will be tuned later.
max_depth = 8 : since there is a high number of observations and predictors, choose a relatively high value.
max_features = 'sqrt' : a general rule of thumb to start with.
subsample = 0.8 : a typically used value (will be tuned later).
0.1 is assumed to be a good learning rate to start with. Let's try to find the optimum number of estimators required for it (a quick check of the min_samples_split benchmark follows below).
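In [ ]:
## A quick check of the min_samples_split benchmark above (an illustrative
## sketch, not part of the original tuning flow): ~0.5-1% of the training rows
n_rows = len(train)
print("0.5-1% of {} rows = {:.0f} to {:.0f}".format(n_rows, 0.005 * n_rows, 0.01 * n_rows))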
In [10]:
#Choose all predictors except target & IDcols
predictors = [ x for x in train.columns if x not in [ target, IDcol]]
param_test1 = { 'n_estimators': range(20,81,10)}
estimator = GradientBoostingClassifier(learning_rate=0.1, min_samples_split=500,
min_samples_leaf=50,max_depth=8,max_features='sqrt', subsample=0.8,random_state=10)
gsearch1 = GridSearchCV(estimator, param_grid=param_test1, scoring='roc_auc', cv=5)  ## iid= was removed in newer scikit-learn
gsearch1.fit(train[predictors], train[target])
Out[10]:
In [11]:
## grid_scores_ was removed from scikit-learn; cv_results_ holds the per-candidate scores
gsearch1.cv_results_['mean_test_score'], gsearch1.best_params_, gsearch1.best_score_
Out[11]:
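In [ ]:
## A more readable view of the same grid-search results (a convenience sketch,
## not in the original notebook):
pd.DataFrame(gsearch1.cv_results_)[['params', 'mean_test_score', 'std_test_score']]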
In [12]:
## Grid search on max_depth and min_samples_split
param_test2 = {'max_depth':range(5,16,2), 'min_samples_split':range(200,1001,200)}
gsearch2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate = 0.1, n_estimators =60, max_features='sqrt',
subsample =0.8, random_state=10),
                        param_grid=param_test2, scoring='roc_auc', cv=5)
gsearch2.fit(train[predictors],train[target])
Out[12]:
In [13]:
gsearch2.cv_results_['mean_test_score'], gsearch2.best_params_, gsearch2.best_score_
Out[13]:
In [15]:
param_test3 = {'min_samples_split': range(800,2100,200),'min_samples_leaf':range(30,71,10)}
gsearch3 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators =60,
max_depth=9,max_features='sqrt', subsample = 0.8,
random_state=10),
                        param_grid=param_test3, scoring='roc_auc', cv=5)
gsearch3.fit(train[predictors], train[target])
Out[15]:
In [17]:
gsearch3.cv_results_['mean_test_score'], gsearch3.best_params_, gsearch3.best_score_
Out[17]:
In [19]:
gsearch3.best_estimator_
Out[19]:
In [20]:
modelfit(gsearch3.best_estimator_, train,test,predictors)
In [22]:
## Tune max_features:
param_test4 = {'max_features': range(7,20,2)}
gsearch4 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate = 0.1, n_estimators = 60, max_depth=9,
min_samples_split=1200,min_samples_leaf=60,
subsample=0.8,random_state=10),
                        param_grid=param_test4, scoring='roc_auc', cv=5)
gsearch4.fit(train[predictors],train[target])
Out[22]:
In [23]:
gsearch4.cv_results_['mean_test_score'], gsearch4.best_params_, gsearch4.best_score_
Out[23]:
In [24]:
## Step 3 - Tune subsample and lower the learning rate
## Grid search on subsample
param_test5 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}
gsearch5 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=60,max_depth=9,
min_samples_split=1200, min_samples_leaf=60, subsample=0.8, random_state=10, max_features=7),
                        param_grid=param_test5, scoring='roc_auc', n_jobs=4, cv=5)
gsearch5.fit(train[predictors],train[target])
Out[24]:
In [25]:
gsearch5.cv_results_['mean_test_score'], gsearch5.best_params_, gsearch5.best_score_
Out[25]:
In [30]:
#### With all parameters tuned, let's try reducing the learning rate and proportionally
#### increasing the number of estimators to get more robust results:
#Choose all predictors except target & IDcols
predictors = [ x for x in train.columns if x not in [target, IDcol]]
gbm_tuned_1 = GradientBoostingClassifier(learning_rate= 0.05, n_estimators = 120, max_depth=9, min_samples_split=1200,
min_samples_leaf=60,subsample=0.85,random_state=10,max_features=7)
modelfit(gbm_tuned_1, train,test,predictors)
In [33]:
## 1/10th learning rate
predictors = [x for x in train.columns if x not in [target,IDcol]]
gbm_tuned_2 = GradientBoostingClassifier(learning_rate = .01, n_estimators = 600, max_depth=9, min_samples_split=1200,
min_samples_leaf=60, subsample=0.85,random_state=10,max_features=7)
modelfit(gbm_tuned_2,train,test,predictors)
In [34]:
### 1/50th learning rate
#Choose all predictors except target & IDcols
predictors = [x for x in train.columns if x not in [target, IDcol]]
gbm_tuned_3 = GradientBoostingClassifier(learning_rate=0.005, n_estimators=1200,max_depth=9, min_samples_split=1200,
min_samples_leaf=60, subsample=0.85, random_state=10, max_features=7,
warm_start=True)
modelfit(gbm_tuned_3, train, test, predictors, performCV=False)
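In [ ]:
## warm_start=True (set above) lets the ensemble grow incrementally instead of
## refitting from scratch. A minimal sketch (illustration only, not run in the
## original notebook):
# gbm_tuned_3.n_estimators += 300   # e.g. extend from 1200 to 1500 trees
# gbm_tuned_3.fit(train[predictors], train[target])  # reuses the trees already fitted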