In [10]:
# Imports
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV # sklearn.grid_search was removed in scikit-learn 0.20
from xgboost.sklearn import XGBClassifier

In [11]:
# Import and preprocess data
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header = None)
test_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',
                      skiprows = 1, header = None) # Make sure to skip a row for the test set

col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 
              'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
             'wage_class']
train_set.columns = col_labels
test_set.columns = col_labels

train_nomissing = train_set.replace(' ?', np.nan).dropna() # Values in this CSV carry a leading space
test_nomissing = test_set.replace(' ?', np.nan).dropna()
# The test labels carry a trailing period ('<=50K.'); normalize them to match the training labels
train_nomissing['wage_class'] = train_nomissing.wage_class.replace({' <=50K.': ' <=50K', ' >50K.': ' >50K'})
test_nomissing['wage_class'] = test_nomissing.wage_class.replace({' <=50K.': ' <=50K', ' >50K.': ' >50K'})

combined_set = pd.concat([train_nomissing, test_nomissing], axis = 0) # Stack vertically so categorical codes stay consistent across train and test
for feature in combined_set.columns: # Loop through all columns in the dataframe
    if combined_set[feature].dtype == 'object': # Only apply for columns with categorical strings
        combined_set[feature] = pd.Categorical(combined_set[feature]).codes # Replace strings with an integer

final_train = combined_set.iloc[:train_nomissing.shape[0]].copy() # Up to the last initial training set row
final_test = combined_set.iloc[train_nomissing.shape[0]:].copy() # Past the last initial training set row

y_train = final_train.pop('wage_class')
y_test = final_test.pop('wage_class')
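
A quick sanity check (a minimal sketch, assuming the cells above have run): row counts should match the original splits after dropping missing rows, and every former string column should now be integer-coded.

In [ ]:
print(final_train.shape, final_test.shape) # Expect (30162, 14) and (15060, 14) after dropping missing rows
print(y_train.value_counts())              # Class balance of the training labels
print(final_train.dtypes.unique())         # All columns should now be numeric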

In [ ]:
# Set up parameter tuning ranges
cv_params = {'max_depth': [3, 5, 7],       # typical search range: 3-10
             'min_child_weight': [1, 3, 5]}
# Other parameters worth tuning later:
#   colsample_bytree (0.5-1.0)
#   reg_lambda (L2 regularization, default 1)
#   reg_alpha (L1 regularization; useful for high dimensionality)
#   scale_pos_weight (default 1; counters class imbalance)
# Set up fixed initial parameters
ind_params = {'learning_rate': 0.1,        # typical range: 0.01-0.2
              'n_estimators': 1000,
              'seed': 0,
              'subsample': 0.8,            # typical range: 0.5-1.0
              'colsample_bytree': 0.8,
              'objective': 'binary:logistic'} # For multiclass, use 'multi:softmax' (requires setting num_class=) or 'multi:softprob'
# Common eval_metric options:
#   rmse     - root mean squared error
#   mae      - mean absolute error
#   logloss  - negative log-likelihood
#   error    - binary classification error rate (0.5 threshold)
#   merror   - multiclass classification error rate
#   mlogloss - multiclass logloss
#   auc      - area under the ROC curve
# Create the grid search object
# Optimize for accuracy since that is the metric used in the Adult Data Set documentation
optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params),
                             cv_params,
                             scoring = 'accuracy',
                             cv = 5,
                             n_jobs = -1)

The general tuning approach:

  1. Choose a relatively high learning rate. Generally a learning rate of 0.1 works, but anywhere between 0.05 and 0.3 can be appropriate for different problems. Determine the optimum number of trees for this learning rate. XGBoost has a very useful function called cv which performs cross-validation at each boosting iteration and thus returns the optimum number of trees required (see the sketch after this list).
  2. Tune the tree-specific parameters (max_depth, min_child_weight, gamma, subsample, colsample_bytree) for the chosen learning rate and number of trees. Note that we can choose different parameters to define a tree; an example is taken up here.
  3. Tune the regularization parameters (lambda, alpha), which can help reduce model complexity and enhance performance.
  4. Lower the learning rate and decide on the optimal parameters.
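
A minimal sketch of step 1, assuming the data prepared above; num_boost_round and the 50-round early-stopping window are arbitrary choices here, not values from the original run:

In [ ]:
# Find a good number of trees for the fixed learning rate with xgboost's native cv
xgdmat = xgb.DMatrix(final_train, label = y_train)
cv_results = xgb.cv({'eta': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8,
                     'objective': 'binary:logistic', 'seed': 0},
                    xgdmat,
                    num_boost_round = 1000,     # Upper bound on the number of trees
                    nfold = 5,
                    metrics = 'error',          # Binary classification error rate
                    early_stopping_rounds = 50) # Stop once CV error stops improving for 50 rounds
print(cv_results.shape[0])                      # Number of boosting rounds actually kept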

In [ ]:
# %time only measures the statement on its own line, so it must share a line with fit()
# (on a line by itself it times an empty statement, reporting a meaningless ~5 µs)
%time optimized_GBM.fit(final_train, y_train)

Out[ ]:
GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'min_child_weight': [1, 3, 5], 'max_depth': [3, 5, 7]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [ ]:
optimized_GBM.cv_results_ # grid_scores_ was removed along with sklearn.grid_search; cv_results_ replaces it
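
From here (assuming the fit above completed), read off the best parameter combination and check accuracy on the held-out test set. Because GridSearchCV defaults to refit=True, the object has already been retrained on all of final_train with the best parameters:

In [ ]:
from sklearn.metrics import accuracy_score

print(optimized_GBM.best_params_) # Best max_depth / min_child_weight combination
print(optimized_GBM.best_score_)  # Mean cross-validated accuracy of that combination

y_pred = optimized_GBM.predict(final_test) # Uses the refit best estimator
print(accuracy_score(y_test, y_pred))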
