In [1]:
import os
import itertools
import joblib
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import metrics
import xgboost as xgb
import warnings

# Suppress library warnings so they don't clutter the notebook output.
warnings.filterwarnings('ignore')
In [2]:
# Load the cleaned census data; it is comma-separated, so the read_csv defaults apply.
df = pd.read_csv(os.path.join("..", "data", "census", "cleaned-census-data.csv"))
df.head()
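A quick look at the target balance helps put the later accuracy numbers in context; a minimal check, assuming `>50K` holds a 0/1 label:
In [ ]:
# Proportion of each class in the target column.
df['>50K'].value_counts(normalize=True)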
In [3]:
# '>50K' is the binary income target; all remaining columns are features.
features_df = df.drop('>50K', axis='columns')
labels_df = df['>50K']
In [4]:
X = features_df.values
y = labels_df.values

# Hold out 20% for testing, then 20% of the remainder for validation,
# leaving roughly 64/16/20 train/val/test.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train, y_train, test_size=0.2)

# Wrap each split in XGBoost's optimized DMatrix format.
dtrain = xgb.DMatrix(X_train, y_train)
dval = xgb.DMatrix(X_val, y_val)
dtest = xgb.DMatrix(X_test, y_test)
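A sanity check on the resulting split sizes (the nested 80/20 splits give roughly 64/16/20 train/val/test):
In [ ]:
# Confirm the shapes of the three splits.
print(X_train.shape, X_val.shape, X_test.shape)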
In [5]:
hyperparams = {
    'eta': 0.5,                       # learning rate
    'max_depth': 7,                   # maximum tree depth
    'objective': 'binary:logistic',   # predict P(label = 1)
    'eval_metric': ['error'],         # report classification error each round
}
print(hyperparams)

num_rounds = 20
eval_list = [(dtrain, 'train'), (dval, 'val')]
In [6]:
# Train for num_rounds boosting rounds, reporting train and validation error each round.
bst = xgb.train(hyperparams, dtrain, num_rounds, evals=eval_list)
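The validation entry in `eval_list` can also drive early stopping instead of a fixed round count; a minimal sketch, where `bst_es` is a name introduced here and the 100-round cap is illustrative (XGBoost monitors the last dataset in `evals`):
In [ ]:
# Stop once validation error hasn't improved for 5 consecutive rounds.
bst_es = xgb.train(hyperparams, dtrain, num_boost_round=100,
                   evals=eval_list, early_stopping_rounds=5)
print(bst_es.best_iteration)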
In [7]:
print(f"Training accuracy: {metrics.accuracy_score(dtrain.get_label(), bst.predict(dtrain).round())}")
print(f"Testing accuracy: {metrics.accuracy_score(dtest.get_label(), bst.predict(dtest).round())}")
In [8]:
print(f"Training F-score: {metrics.f1_score(dtrain.get_label(), bst.predict(dtrain).round())}")
print(f"Testing F-score: {metrics.f1_score(dtest.get_label(), bst.predict(dtest).round())}")
In [9]:
# Persist the trained booster; joblib compresses automatically for a .gz filename.
joblib.dump(bst, os.path.join("..", "output", "xgboost.gz"))
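To reuse the saved model later, load it back with the matching joblib call; a short usage sketch (`bst_loaded` is a name introduced here):
In [ ]:
# Reload the compressed booster and confirm it still scores the test set.
bst_loaded = joblib.load(os.path.join("..", "output", "xgboost.gz"))
print(f"Reloaded model test accuracy: {metrics.accuracy_score(dtest.get_label(), bst_loaded.predict(dtest).round())}")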