# Imports
import time

import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

try:
    from sklearn.model_selection import GridSearchCV
except ImportError:  # older scikit-learn releases without model_selection
    from sklearn.grid_search import GridSearchCV
# Import and preprocess data
# NOTE(review): both CSV paths below are empty strings, so read_csv will fail
# until the real train/test file locations are filled in.
train_set = pd.read_csv('', header=None)
test_set = pd.read_csv('', skiprows=1, header=None)  # Make sure to skip a row for the test set

# The files are read without a header row, so the columns are named here.
# The last column is the label that the rest of the script calls 'wage_class'.
col_labels = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
    'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
    'hours_per_week', 'native_country', 'wage_class',
]
train_set.columns = col_labels
test_set.columns = col_labels

# ' ?' is treated as the missing-value marker: convert it to NaN and drop those rows.
train_nomissing = train_set.replace(' ?', np.nan).dropna()
test_nomissing = test_set.replace(' ?', np.nan).dropna()

# Map the label variants with a trailing '.' onto the plain ones so that both
# sets end up with the same two label strings.
label_fixes = {' <=50K.': ' <=50K', ' >50K.': ' >50K'}
train_nomissing['wage_class'] = train_nomissing.wage_class.replace(label_fixes)
test_nomissing['wage_class'] = test_nomissing.wage_class.replace(label_fixes)

# Encode on the stacked frame so a given string receives the same integer code
# in the training rows and in the test rows.
combined_set = pd.concat([train_nomissing, test_nomissing], axis=0)  # Stacks them vertically
for feature in combined_set.columns:  # Loop through all columns in the dataframe
    if combined_set[feature].dtype == 'object':  # Only apply for columns with categorical strings
        combined_set[feature] = pd.Categorical(combined_set[feature]).codes  # Replace strings with an integer

# Split back by position. The explicit copies make pop() operate on independent
# frames instead of on slices of combined_set.
n_train = train_nomissing.shape[0]
final_train = combined_set.iloc[:n_train].copy()  # Up to the last initial training set row
final_test = combined_set.iloc[n_train:].copy()  # Past the last initial training set row

# Separate the label column from the features.
y_train = final_train.pop('wage_class')
y_test = final_test.pop('wage_class')
# Parameter ranges explored by the grid search.
# Other candidates that are not tuned here:
#   colsample_bytree  (0.5-1.0)
#   lambda = 1
#   alpha             regularization term (useful for high dimensionality)
#   scale_pos_weight  [default=1] counters class imbalance
cv_params = {
    'max_depth': [3, 5, 7],  # typical range 3-10
    'min_child_weight': [1, 3, 5],
}

# Parameters held fixed for every grid point.
# For multiclass problems the objective would be 'multi:softmax' (which requires
# setting num_class=) or 'multi:softprob'.
ind_params = {
    'learning_rate': 0.1,  # typical range 0.01-0.2
    'n_estimators': 1000,
    'subsample': 0.8,  # typical range 0.5-1.0
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
}

# eval_metric options, for reference:
#   rmse     - root mean square error
#   mae      - mean absolute error
#   logloss  - negative log-likelihood
#   error    - binary classification error rate (0.5 threshold)
#   merror   - multiclass classification error rate
#   mlogloss - multiclass logloss
#   auc      - area under the curve
# Create the grid search: an XGBClassifier configured with ind_params is
# cross-validated (5 folds) for every parameter combination in cv_params.
# Optimize for accuracy since that is the metric used in the Adult Data Set notation
optimized_GBM = GridSearchCV(
    xgb.XGBClassifier(**ind_params),
    cv_params,  # the parameter grid is a required argument of GridSearchCV
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# Fit on the training features and labels. The wall-clock time is reported
# explicitly because IPython's `%time` magic is not valid in a plain script.
# NOTE(review): final_train is assumed to be the intended feature matrix - confirm.
fit_started = time.perf_counter()
optimized_GBM.fit(final_train, y_train)
print('Grid search fit took {:.1f} s'.format(time.perf_counter() - fit_started))
