Gradient Boosting with Grid Search (XGBoost)


In [1]:
import os
import itertools

import joblib

import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn import metrics

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

Data Loading


In [2]:
df = pd.read_csv(os.path.join("..", "data", "census", "cleaned-census-data.csv"))

df.head()

In [3]:
features_df = df.drop('>50K', axis='columns')
labels_df = df['>50K']

Data Splitting


In [4]:
X = features_df.values
y = labels_df.values

# Hold out 20% of the data for testing, then 20% of the remainder for
# validation, giving roughly a 64/16/20 train/val/test split
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train, y_train, test_size=0.2)

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)
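
DMatrix is XGBoost's optimized internal data container; converting the NumPy arrays once here avoids repeated conversion on every subsequent training and prediction call.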

Training


In [5]:
hyperparams = {
    'eta': 0.5,
    'max_depth': 7,
    'objective': 'binary:logistic',
    'eval_metric': ['error'],
}
print(hyperparams)

num_rounds = 20
eval_list = [(dtrain, 'train'), (dval, 'val')]
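
Here eta is the learning rate (the shrinkage applied after each boosting round) and max_depth caps the depth of each tree. The binary:logistic objective makes predict return probabilities, and the error metric reports the misclassification rate at a 0.5 threshold on both sets in eval_list.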

In [6]:
bst = xgb.train(hyperparams, dtrain, num_rounds, eval_list)
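
The notebook title promises a grid search, but the cell above trains a single configuration, and itertools is imported without being used. A minimal sketch of what that search might look like, assuming an illustrative (not tuned) grid over eta and max_depth and selecting by final validation error:

In [ ]:
# Hypothetical grid; the values here are illustrative, not tuned
param_grid = {
    'eta': [0.1, 0.3, 0.5],
    'max_depth': [3, 5, 7],
}

best_error, best_params, best_bst = float('inf'), None, None
for eta, max_depth in itertools.product(param_grid['eta'], param_grid['max_depth']):
    params = {
        'eta': eta,
        'max_depth': max_depth,
        'objective': 'binary:logistic',
        'eval_metric': ['error'],
    }
    evals_result = {}
    booster = xgb.train(params, dtrain, num_rounds, eval_list,
                        evals_result=evals_result, verbose_eval=False)
    # Keep the combination with the lowest final error on the validation set
    val_error = evals_result['val']['error'][-1]
    if val_error < best_error:
        best_error, best_params, best_bst = val_error, params, booster

print(f"Best validation error: {best_error} with {best_params}")

The selected best_bst could then stand in for bst in the Testing cells below.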

Testing


In [7]:
print(f"Training accuracy: {metrics.accuracy_score(dtrain.get_label(), bst.predict(dtrain).round())}")
print(f"Testing accuracy: {metrics.accuracy_score(dtest.get_label(), bst.predict(dtest).round())}")

In [8]:
print(f"Training F-score: {metrics.f1_score(dtrain.get_label(), bst.predict(dtrain).round())}")
print(f"Testing F-score: {metrics.f1_score(dtest.get_label(), bst.predict(dtest).round())}")

Save Model


In [9]:
joblib.dump(bst, os.path.join("..", "output", "xgboost.gz"))
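
joblib round-trips the Booster through pickle, so the saved file can be loaded back the same way; a quick sanity check, assuming the path written above. (XGBoost's native bst.save_model format is generally more portable across library versions than a pickle.)

In [ ]:
# Reload the pickled Booster and confirm it scores the same test set
bst_loaded = joblib.load(os.path.join("..", "output", "xgboost.gz"))
print(f"Reloaded testing accuracy: {metrics.accuracy_score(dtest.get_label(), bst_loaded.predict(dtest).round())}")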