In [0]:
import itertools
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import xgboost as xgb
from tensorflow import keras
from tensorflow.keras import Sequential
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from google.cloud import bigquery
To demonstrate downsampling, we'll be using a synthetic fraud detection dataset originally published on Kaggle. We've made a version of it available in a public Cloud Storage bucket.
In [2]:
# Download the synthetic fraud dataset (CSV) from the public Cloud Storage
# bucket into the local working directory
!gsutil cp gs://ml-design-patterns/fraud_data_kaggle.csv .
In [3]:
# Load the downloaded CSV into a DataFrame and preview the first rows
fraud_data = pd.read_csv('fraud_data_kaggle.csv')
fraud_data.head()
Out[3]:
In [0]:
# Remove the ID-like / unused columns, then one-hot encode any remaining
# non-numeric columns so the data is model-ready
fraud_data = pd.get_dummies(
    fraud_data.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud'])
)
In [0]:
# Separate the fraud and non-fraud rows into their own DataFrames
fraud_mask = fraud_data['isFraud'] == 1
fraud = fraud_data[fraud_mask]
not_fraud = fraud_data[~fraud_mask]

# Downsample the majority (non-fraud) class with a random sample.
# frac=.005 gives roughly an 80/20 not-fraud/fraud balance;
# feel free to experiment with this value.
not_fraud_sample = not_fraud.sample(frac=.005, random_state=2)
In [0]:
# Recombine the downsampled non-fraud rows with all fraud rows, then
# shuffle so the two classes are interleaved
fraud_data = shuffle(pd.concat([not_fraud_sample, fraud]), random_state=2)
In [7]:
# Class balance after downsampling — should be roughly 80/20
# not-fraud/fraud given frac=.005 above
fraud_data['isFraud'].value_counts()
Out[7]:
In [0]:
# Positional 80/20 train/test split (data was already shuffled above).
# Use .copy() so that popping the label column below mutates independent
# frames rather than views of fraud_data (the original popped from slice
# views, which triggers pandas' SettingWithCopyWarning).
split_index = int(len(fraud_data) * .8)
train_data = fraud_data[:split_index].copy()
test_data = fraud_data[split_index:].copy()

# Separate the label column from the features
train_labels = train_data.pop('isFraud')
test_labels = test_data.pop('isFraud')
In [0]:
# XGBoost regression model on the 0/1 fraud label (predictions are
# continuous and get rounded to a class later).
# 'reg:linear' was deprecated in XGBoost 0.83 and renamed to
# 'reg:squarederror' — same squared-error objective, no warning.
model = xgb.XGBRegressor(
    objective='reg:squarederror'
)
In [10]:
# Train the XGBoost model on the downsampled training set
model.fit(train_data.values, train_labels)
Out[10]:
In [0]:
# Get predictions on the held-out test set
# (regression outputs: continuous values, not 0/1 classes)
y_pred = model.predict(test_data.values)
In [0]:
# The scikit-learn confusion-matrix utility needs integer class labels,
# so round each continuous prediction to the nearest int
y_pred_formatted = [int(round(pred)) for pred in y_pred]
In [13]:
# 2x2 confusion matrix: rows are true labels, columns are predicted labels
cm = confusion_matrix(test_labels.values, y_pred_formatted)
print(cm)
In [0]:
# Adapted from the sklearn docs
# https://scikit-learn.org/0.18/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.

    Args:
        cm: square confusion matrix (numpy array), e.g. the output of
            sklearn.metrics.confusion_matrix.
        classes: sequence of class display names, in label order.
        normalize: if True, show each cell as a fraction of its row
            (true-class) total, rounded to 3 decimals.
        title: title for the plot.
        cmap: matplotlib colormap for the heatmap.
    """
    # Normalize BEFORE drawing so the heatmap colors, the cell annotations,
    # and the text-color threshold all reflect the same values. (The
    # original normalized after imshow, so the image showed raw counts
    # while the text showed fractions.)
    if normalize:
        cm = np.round(cm.astype('float') / cm.sum(axis=1)[:, np.newaxis], 3)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Annotate every cell, switching to white text on dark backgrounds
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [15]:
# With downsampling, the model's accuracy on fraud examples is almost as
# good as on non-fraud examples. You can compare by training a model on
# the full dataset if you'd like (it'll take a long time given the size).
classes = ['not fraud', 'fraud']
plot_confusion_matrix(cm, classes, normalize=True)
In [0]:
# To access BigQuery, you'll need to authenticate to your Cloud account
# (Colab-only helper; opens an interactive auth prompt)
from google.colab import auth
auth.authenticate_user()
We'll take all of the fraud examples from this dataset, and a subset of non-fraud. Then we'll combine and shuffle them, and look at the number of examples we have for each class.
In [0]:
%%bigquery fraud_df --project sara-cloud-ml
-- Pull every fraud example (Class = 1) into the `fraud_df` DataFrame
SELECT
*
FROM
`bigquery-public-data.ml_datasets.ulb_fraud_detection`
WHERE Class = 1
In [0]:
# This query will take a a minute to run
%%bigquery nonfraud_df --project sara-cloud-ml
SELECT
*
FROM
`bigquery-public-data.ml_datasets.ulb_fraud_detection`
WHERE Class = 0
AND RAND() < 0.05
In [19]:
# Combine all fraud rows with the sampled non-fraud rows, then shuffle.
# Shuffling matters here: concat places every fraud example first, so the
# positional 80/20 split below would otherwise leave the test set with
# almost no fraud examples. (The original discarded the sort_values()
# result — it was display-only — and left the shuffle commented out.)
bq_fraud_data = pd.concat([fraud_df, nonfraud_df])
bq_fraud_data = shuffle(bq_fraud_data, random_state=22)
bq_fraud_data.head()
Out[19]:
In [0]:
# Min-max scale the Time and Amount columns into [0, 1].
# Each column gets its own scaler so they can be inverted independently.
time_scaler = MinMaxScaler()
amt_scaler = MinMaxScaler()
for column, scaler in [('Time', time_scaler), ('Amount', amt_scaler)]:
    bq_fraud_data[column] = scaler.fit_transform(
        bq_fraud_data[column].values.reshape(-1, 1))
In [21]:
# Check the class balance after downsampling (Class 1 = fraud)
bq_fraud_data['Class'].value_counts()
Out[21]:
In [0]:
# Positional 80/20 train/test split of the BigQuery fraud data.
# Use .copy() so that popping the label column below mutates independent
# frames rather than views of bq_fraud_data (the original popped from
# slice views, which triggers pandas' SettingWithCopyWarning).
split_index = int(len(bq_fraud_data) * .8)
train_data = bq_fraud_data[:split_index].copy()
test_data = bq_fraud_data[split_index:].copy()

# Separate the label column from the features
train_labels = train_data.pop('Class')
test_labels = test_data.pop('Class')
In [0]:
# Wrap the numpy feature matrices and label Series in tf.data pipelines
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_data.values, train_labels))
    .shuffle(len(train_data))
    .batch(1024)
)
test_dataset = (
    tf.data.Dataset.from_tensor_slices((test_data.values, test_labels))
    # NOTE(review): shuffling the test set is harmless but unnecessary
    .shuffle(len(test_data))
    .batch(1)
)
Now let's try with weighted classes and add a bias initializer to our output layer. First, calculate the class weights.
In [0]:
# Count training examples per class; fraud (label 1) is the minority
label_counts = train_labels.value_counts()
num_minority = label_counts[1]
num_majority = label_counts[0]
In [25]:
# Weight each class inversely to its frequency: 1 / (count / N) / 2
# == N / (2 * count), so each class contributes equally to the total loss
minority_class_weight = 1 / (num_minority / len(train_data)) / 2
majority_class_weight = 1 / (num_majority / len(train_data)) / 2
# Pass the weights to Keras in a dict
# The key is the index of each class
keras_class_weights = {0: majority_class_weight, 1: minority_class_weight}
print(keras_class_weights)
# Calculate output bias: log of the positive/negative class ratio,
# used below to initialize the output layer's bias
output_bias = math.log(num_minority / num_majority)
print(output_bias)
In [0]:
# Small feedforward binary classifier. The output layer's bias is
# initialized to log(num_minority / num_majority) (computed above) so the
# initial sigmoid output reflects the class imbalance.
fraud_model = keras.Sequential()
fraud_model.add(keras.layers.Dense(
    16, input_shape=(len(train_data.iloc[0]),), activation='relu'))
fraud_model.add(keras.layers.Dropout(0.25))
fraud_model.add(keras.layers.Dense(16, activation='relu'))
fraud_model.add(keras.layers.Dense(
    1, activation='sigmoid',
    bias_initializer=tf.keras.initializers.Constant(output_bias)))
In [0]:
# Track accuracy plus imbalance-aware metrics (precision, recall, ROC AUC)
metric_specs = [
    (tf.keras.metrics.BinaryAccuracy, 'accuracy'),
    (tf.keras.metrics.Precision, 'precision'),
    (tf.keras.metrics.Recall, 'recall'),
    (tf.keras.metrics.AUC, 'roc_auc'),
]
metrics = [metric_cls(name=metric_name)
           for metric_cls, metric_name in metric_specs]
In [0]:
# Binary cross-entropy loss with Adam; metrics list defined in the cell above
fraud_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)
In [0]:
# Train with the per-class loss weights computed above so errors on the
# minority (fraud) class are weighted up
fraud_model.fit(train_dataset, validation_data=test_dataset, epochs=10, class_weight=keras_class_weights)
In [124]:
# This will take about a minute to run
%%bigquery --project sara-cloud-ml
CREATE OR REPLACE MODEL
`sara-cloud-ml.natality.baby_weight_clusters` OPTIONS(model_type='kmeans',
num_clusters=4) AS
SELECT
weight_pounds,
mother_age,
gestation_weeks
FROM
`bigquery-public-data.samples.natality`
LIMIT 10000
Out[124]:
First, let's look at the cluster prediction results for an "average" example from our dataset.
In [0]:
%%bigquery average_pred --project sara-cloud-ml
-- Predict the cluster for a typical datapoint (7 lbs, mother age 28,
-- 40 gestation weeks); result is stored in the `average_pred` DataFrame
SELECT
*
FROM
ML.PREDICT (MODEL `sara-cloud-ml.natality.baby_weight_clusters`,
(
SELECT
7.0 as weight_pounds,
28 as mother_age,
40 as gestation_weeks
)
)
In [136]:
# Display the cluster prediction for the "average" example
average_pred
Out[136]:
Here, it's fairly obvious that this datapoint should be put in cluster 1, given the short distance from that cluster.
In [133]:
# Print the resulting cluster distances for the "average" example.
# Fixed: the original referenced an undefined `df`; the %%bigquery magic
# above stored its result as `average_pred` (same pattern as the
# `outlier_pred` cell below).
average_pred['NEAREST_CENTROIDS_DISTANCE'].iloc[0]
Out[133]:
Let's compare this with a cluster prediction for an outlier baby weight.
In [0]:
%%bigquery outlier_pred --project sara-cloud-ml
-- Predict the cluster for an outlier datapoint (very low weight and few
-- gestation weeks); result is stored in the `outlier_pred` DataFrame
SELECT
*
FROM
ML.PREDICT (MODEL `sara-cloud-ml.natality.baby_weight_clusters`,
(
SELECT
3.0 as weight_pounds,
20 as mother_age,
27 as gestation_weeks
)
)
In [138]:
# Display the cluster prediction for the outlier example
outlier_pred
Out[138]:
Here there's a high distance from each cluster, which we can use to conclude that this might be an anomaly.
In [139]:
# Print the outlier's distance to each cluster centroid
outlier_pred['NEAREST_CENTROIDS_DISTANCE'].iloc[0]
Out[139]:
Copyright 2020 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License