This notebook shows how to:
You will need a Google Cloud Platform account and project to run this notebook. Instructions for creating a project can be found here.
In [1]:
import sys
# Major version of the running interpreter, as an int (e.g. 2 or 3)
python_version = sys.version_info[0]
In [ ]:
# If you're running on Colab, you'll need to install the What-if Tool package and authenticate
# If you're on Cloud AI Platform Notebooks, you'll need to install XGBoost on the TF instance
def pip_install(module):
if python_version == '2':
!pip install {module} --quiet
else:
!pip3 install {module} --quiet
# Detect Colab by trying to import its helper package
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    # Only a failed import means "not running on Colab"; any other error
    # (including KeyboardInterrupt) should surface rather than be swallowed.
    IN_COLAB = False

if IN_COLAB:
    # Colab: install the What-if Tool widget and authenticate the user
    pip_install('witwidget')
    from google.colab import auth
    auth.authenticate_user()
else:
    # Elsewhere: install XGBoost
    pip_install('xgboost')
In [0]:
import pandas as pd
import xgboost as xgb
import numpy as np
import collections
import witwidget
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils import shuffle
from witwidget.notebook.visualization import WitWidget, WitConfigBuilder
In this section we'll:
For example, an `agency_code` of `1` becomes Office of the Comptroller of the Currency (OCC).
In [0]:
# Use a small subset of the data since the original dataset is too big for Colab (2.5GB)
# Data source: https://www.ffiec.gov/hmda/hmdaflat.htm
# Copies mortgage-small.csv from Cloud Storage into the current working directory
!gsutil cp gs://mortgage_dataset_files/mortgage-small.csv .
In [0]:
# Set column dtypes for Pandas.
# The OrderedDict is built from a sequence of (name, dtype) pairs rather than a
# dict literal: a dict literal has no guaranteed order on Python < 3.7, so
# wrapping one in OrderedDict would not reliably preserve the order below.
COLUMN_NAMES = collections.OrderedDict([
    ('as_of_year', np.int16),
    ('agency_code', 'category'),
    ('loan_type', 'category'),
    ('property_type', 'category'),
    ('loan_purpose', 'category'),
    ('occupancy', np.int8),
    ('loan_amt_thousands', np.float64),
    ('preapproval', 'category'),
    ('county_code', np.float64),
    ('applicant_income_thousands', np.float64),
    ('purchaser_type', 'category'),
    ('hoepa_status', 'category'),
    ('lien_status', 'category'),
    ('population', np.float64),
    ('ffiec_median_fam_income', np.float64),
    ('tract_to_msa_income_pct', np.float64),
    ('num_owner_occupied_units', np.float64),
    ('num_1_to_4_family_units', np.float64),
    ('approved', np.int8),
])
In [0]:
# Read the CSV using the dtypes declared in COLUMN_NAMES, discard rows that
# contain missing values, then shuffle the rows with a fixed seed
data = shuffle(
    pd.read_csv('mortgage-small.csv', index_col=False, dtype=COLUMN_NAMES).dropna(),
    random_state=2,
)
data.head()
In [0]:
# Label preprocessing: pull the values of the 'approved' column out of the DataFrame
labels = data['approved'].values
# See the distribution of approved / denied classes (0: denied, 1: approved)
print(data['approved'].value_counts())
In [0]:
# Remove the 'approved' label column from the DataFrame.
# NOTE: this reassigns `data`, so re-running the cell raises a KeyError
# because the column is already gone.
data = data.drop(columns=['approved'])
In [0]:
# One-hot encode every column whose dtype is 'category'
dummy_columns = data.dtypes[data.dtypes == 'category'].index.tolist()
data = pd.get_dummies(data=data, columns=dummy_columns)
In [0]:
# Preview the first few rows of `data`
data.head()
In [0]:
# Split the data into train / test sets.
# A fixed random_state makes the split, and therefore the metrics reported
# below, repeatable across runs (the shuffle of the data is seeded as well).
x, y = data, labels
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2)
In [0]:
# Fit an XGBoost classifier on the training split; expect this to take a few minutes
bst = xgb.XGBClassifier(objective='reg:logistic')
bst.fit(x_train, y_train)
In [0]:
# Predict on the held-out test set, then report the share of correct predictions
y_pred = bst.predict(x_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred.round())
print(acc, '\n')
In [0]:
# Print a row-normalized confusion matrix: each row is divided by its own
# total, so every row sums to 1
print('Confusion matrix:')
cm = confusion_matrix(y_test, y_pred.round())
# keepdims=True keeps the row totals as a column vector so the division is
# applied per row; a flat vector of row totals would broadcast across columns
# instead. The builtin float replaces np.float, an alias that newer NumPy
# releases no longer provide.
cm = cm.astype(float) / cm.sum(axis=1, keepdims=True)
print(cm)
In [0]:
# Save the model so we can deploy it
# Writes the trained model to the local file 'model.bst'
bst.save_model('model.bst')
In [0]:
# Define some globals - update these to your own project + model names
# The shell commands in the following cells reference these as $NAME
GCP_PROJECT = 'YOUR_GCP_PROJECT'  # project the gcloud commands run against
MODEL_BUCKET = 'gs://your_storage_bucket'  # Cloud Storage location the model file is copied to
MODEL_NAME = 'your_model_name' # You'll create this model below
VERSION_NAME = 'v1'  # name given to the model version created below
In [0]:
# Copy your model file to Cloud Storage
# Uploads the local ./model.bst to the location held in MODEL_BUCKET
!gsutil cp ./model.bst $MODEL_BUCKET
In [ ]:
# Configure gcloud to use your project
# Sets the gcloud 'project' config value to the value of GCP_PROJECT
!gcloud config set project $GCP_PROJECT
In [0]:
# Create a model
# Creates an AI Platform model resource named by MODEL_NAME
!gcloud ai-platform models create $MODEL_NAME
In [0]:
# Create a version, this will take ~2 minutes to deploy
# Creates version VERSION_NAME of model MODEL_NAME from the files under
# MODEL_BUCKET, using the XGBoost framework on runtime 1.14 with Python 3.5
!gcloud ai-platform versions create $VERSION_NAME \
--model=$MODEL_NAME \
--framework='XGBOOST' \
--runtime-version=1.14 \
--origin=$MODEL_BUCKET \
--python-version=3.5 \
--project=$GCP_PROJECT
In [0]:
# Number of test examples to display in the What-if Tool
num_wit_examples = 500

# Build the What-if Tool input: the first `num_wit_examples` rows of the test
# features, with the matching ground-truth label appended as a final column
test_examples = np.hstack((
    x_test[:num_wit_examples].values,
    y_test[:num_wit_examples].reshape(-1, 1),
))
In [0]:
# Create a What-if Tool visualization, it may take a minute to load
# See the cell below this for exploration ideas
# This prediction adjustment function is needed as this xgboost model's
# prediction returns just a score for the positive class of the binary
# classification, whereas the What-If Tool expects a list of scores for each
# class (in this case, both the negative class and the positive class).
def adjust_prediction(pred):
    """Expand a single positive-class score into a two-class score list.

    Args:
        pred: Score for the positive class.

    Returns:
        A two-element list: [1 - pred, pred].
    """
    positive_score = pred
    negative_score = 1 - positive_score
    return [negative_score, positive_score]
# Configure the What-if Tool: pass the example rows and their column names
# (the appended label column is named 'mortgage_status'), point the tool at
# the deployed AI Platform model, and name the two label classes
config_builder = (
    WitConfigBuilder(
        test_examples.tolist(),
        data.columns.tolist() + ['mortgage_status'],
    )
    .set_ai_platform_model(
        GCP_PROJECT,
        MODEL_NAME,
        VERSION_NAME,
        adjust_prediction=adjust_prediction,
    )
    .set_target_feature('mortgage_status')
    .set_label_vocab(['denied', 'approved'])
)
WitWidget(config_builder, height=800)
Individual data points: the default graph shows all data points from the test set, colored by their ground truth label (approved or denied)
Binning data: create separate graphs for individual features
Exploring overall performance: Click on the "Performance & Fairness" tab to view overall performance statistics on the model's results on the provided dataset, including confusion matrices, PR curves, and ROC curves.