In [ ]:
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
This notebook uses the Census Income Data Set to demonstrate how to train a model and generate local predictions with XGBoost.
The dataset used for training is provided by the UC Irvine Machine Learning Repository.
In [ ]:
%pip install xgboost
In [ ]:
import datetime
import os
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion, make_pipeline
import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
We can download the dataset from the UC Irvine Machine Learning Repository to our local machine:
In [ ]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
In [ ]:
census_data_filename = './adult.data'
# These are the column labels from the census data files
COLUMNS = (
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income-level'
)
# Load the training census dataset
with open(census_data_filename, 'r') as train_data:
    raw_training_data = pd.read_csv(train_data, header=None, names=COLUMNS)
Now, let's take a look at the data to have a better understanding of it:
In [ ]:
raw_training_data.head()
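It can also help to check the dataset's size, column types, and how the income labels are distributed. This quick inspection is optional and uses standard pandas calls:
In [ ]:
# Optional: inspect the shape, column dtypes, and target distribution
print(raw_training_data.shape)
print(raw_training_data.dtypes)
raw_training_data['income-level'].value_counts()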
First, let's separate the features from the target and convert them to NumPy arrays:
In [ ]:
raw_features = raw_training_data.drop('income-level', axis=1).values
# Create training labels list
train_labels = (raw_training_data['income-level'] == ' >50K').values
The features are a combination of numerical and categorical values. As part of data preparation, before we can feed the data to the model, we need to convert the categorical features to numerical ones. We will use scikit-learn to prepare the data.
scikit-learn provides a convenient API for creating and training a pipeline that preprocesses the data before it is fed to the model. We will use a custom pipeline in this notebook to prepare the data for XGBoost:
In [ ]:
class PositionalSelector(BaseEstimator, TransformerMixin):
    """Selects a subset of columns by their positional index."""

    def __init__(self, positions):
        self.positions = positions

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array(X)[:, self.positions]


class StripString(BaseEstimator, TransformerMixin):
    """Strips leading and trailing whitespace from string values."""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        strip = np.vectorize(str.strip)
        return strip(np.array(X))


class SimpleOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encodes each column using the values seen during fit."""

    def fit(self, X, y=None):
        # Map each distinct value in each column to an output column index
        self.values = []
        for c in range(X.shape[1]):
            Y = X[:, c]
            values = {v: i for i, v in enumerate(np.unique(Y))}
            self.values.append(values)
        return self

    def transform(self, X):
        X = np.array(X)
        matrices = []
        for c in range(X.shape[1]):
            Y = X[:, c]
            matrix = np.zeros(shape=(len(Y), len(self.values[c])), dtype=np.int8)
            for i, x in enumerate(Y):
                # Values not seen during fit are encoded as all zeros
                if x in self.values[c]:
                    matrix[i][self.values[c][x]] = 1
            matrices.append(matrix)
        res = np.concatenate(matrices, axis=1)
        return res
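To see how these transformers behave before wiring them into the full pipeline, here is a small, optional illustration on made-up values (the toy array below is not part of the census data):
In [ ]:
# Optional illustration on made-up data: strip whitespace, then one-hot encode
toy = np.array([[' a', 'x'],
                [' b', 'x'],
                ['a ', 'y']], dtype=object)
toy_encoder = make_pipeline(StripString(), SimpleOneHotEncoder())
toy_encoder.fit_transform(toy)  # one indicator column per distinct value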
To keep things simple, the pipeline uses only a subset of the features: the numerical features age and hours-per-week, and the categorical features workclass, education, marital-status, and relationship.
Now we can create the pipeline object and fit it to preprocess our data:
In [ ]:
# Numerical features: age and hours-per-week
# Categorical features: workclass, education, marital-status, and relationship
numerical_indices = [0, 12]         # age and hours-per-week
categorical_indices = [1, 3, 5, 7]  # workclass, education, marital-status, and relationship

# Categorical columns: strip whitespace, then one-hot encode
p1 = make_pipeline(PositionalSelector(categorical_indices),
                   StripString(),
                   SimpleOneHotEncoder())
# Numerical columns: standardize to zero mean and unit variance
p2 = make_pipeline(PositionalSelector(numerical_indices),
                   StandardScaler())

pipeline = FeatureUnion([
    ('categoricals', p1),
    ('numericals', p2),
])
train_features = pipeline.fit_transform(raw_features)
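If you plan to preprocess prediction inputs outside this notebook, you may also want to persist the fitted pipeline. One common option (not required for the rest of this notebook) is joblib, which ships as a scikit-learn dependency:
In [ ]:
# Optional: save the fitted preprocessing pipeline for reuse at prediction time
import joblib
joblib.dump(pipeline, 'pipeline.joblib')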
Our dataset is ready for training the model now:
In [ ]:
# train the model
model = xgb.XGBClassifier(max_depth=4)
model.fit(train_features, train_labels)
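As a rough sanity check (training accuracy is optimistic; a held-out test set would give a more honest estimate), we can score the model on the data it was trained on:
In [ ]:
# Optional: accuracy on the training data itself
from sklearn.metrics import accuracy_score
accuracy_score(train_labels, model.predict(train_features))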
Once the model is trained, we can save it to a file:
In [ ]:
# Save the trained model
model.save_model('model.bst')
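If you want to verify the saved file, the model can be loaded back into an XGBoost booster, for example:
In [ ]:
# Optional: load the saved model back from disk
loaded_model = xgb.Booster()
loaded_model.load_model('model.bst')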
To test local predictions, let's define a couple of sample instances. Each instance contains the same columns as the training data, except the income-level target:
In [ ]:
instances = [
    [
        42, ' State-gov', 77516, ' Bachelors', 13, ' Never-married',
        ' Adm-clerical', ' Not-in-family', ' White', ' Male', 2174, 0, 40,
        ' United-States'
    ],
    [
        50, ' Self-emp-not-inc', 83311, ' Bachelors', 13,
        ' Married-civ-spouse', ' Exec-managerial', ' Husband',
        ' White', ' Male', 0, 0, 10, ' United-States'
    ],
]
First, we need to preprocess the instances with the same pipeline we fitted on the training data:
In [ ]:
processed_instances = pipeline.transform(instances)
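The processed instances should have the same number of columns as the training features; here is a quick optional check:
In [ ]:
# Optional: column counts must match what the model was trained on
processed_instances.shape, train_features.shape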
Then we'll pass the processed data to the model for classification:
In [ ]:
model.predict(processed_instances)
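predict returns the predicted class for each instance (True means income above 50K). If you also want class probabilities, the scikit-learn wrapper for XGBoost exposes predict_proba:
In [ ]:
# Probability of each class; the second column is the probability of income >50K
model.predict_proba(processed_instances)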