In [0]:
#@title Copyright 2020 Google LLC. Double-click here for license information.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
So far, you've only created regression models. That is, you created models that produced floating-point predictions, such as "houses in this neighborhood cost N thousand dollars." In this Colab, you'll create and evaluate a binary classification model. That is, you'll create a model that answers a binary question. In this exercise, the binary question will be, "Are houses in this neighborhood above a certain price?"
After doing this Colab, you'll know how to:
Convert a regression question into a binary classification question.
Modify the classification threshold and determine how that modification influences the model.
Experiment with different classification metrics to determine your model's effectiveness.
Like several of the previous Colabs, this Colab uses the California Housing Dataset.
The following hidden code cell ensures that the Colab will run on TensorFlow 2.X.
In [0]:
#@title Run on TensorFlow 2.x
%tensorflow_version 2.x
In [0]:
#@title Load the imports
# from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers
from matplotlib import pyplot as plt
# The following lines adjust the granularity of reporting.
pd.options.display.max_rows = 10
pd.options.display.float_format = "{:.1f}".format
# tf.keras.backend.set_floatx('float32')
print("Ran the import statements.")
In [0]:
train_df = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv")
test_df = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_test.csv")
train_df = train_df.reindex(np.random.permutation(train_df.index)) # shuffle the training set
Unlike some of the previous Colabs, the preceding code cell did not scale the label (median_house_value). The following section ("Normalize values") provides an alternative approach.
When creating a model with multiple features, the values of each feature should cover roughly the same range. For example, if one feature's range spans 500 to 100,000 and another feature's range spans 2 to 12, then the model will be difficult or impossible to train. Therefore, you should normalize features in a multi-feature model.
The following code cell normalizes the datasets by converting each raw value (including the label) to its Z-score. A Z-score is the number of standard deviations from the mean for a particular raw value. For example, consider a feature with a mean of 60 and a standard deviation of 10:
The raw value 75 would have a Z-score of +1.5:
Z-score = (75 - 60) / 10 = +1.5
The raw value 38 would have a Z-score of -2.2:
Z-score = (38 - 60) / 10 = -2.2
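If you want to sanity-check that arithmetic yourself, the short snippet below reproduces both Z-scores. The mean of 60 and standard deviation of 10 are just the illustrative values from the example above, not statistics of the housing data.
In [0]:
# Quick check of the worked Z-score example above.
# The mean (60) and standard deviation (10) are illustrative values only.
example_mean = 60
example_std = 10
print((75 - example_mean) / example_std)  # 1.5
print((38 - example_mean) / example_std)  # -2.2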
In [0]:
# Calculate the Z-scores of each column in the training set and
# write those Z-scores into a new pandas DataFrame named train_df_norm.
train_df_mean = train_df.mean()
train_df_std = train_df.std()
train_df_norm = (train_df - train_df_mean)/train_df_std
# Examine some of the values of the normalized training set. Notice that most
# Z-scores fall between -2 and +2.
train_df_norm.head()
In [0]:
# Calculate the Z-scores of each column in the test set and
# write those Z-scores into a new pandas DataFrame named test_df_norm.
test_df_mean = test_df.mean()
test_df_std = test_df.std()
test_df_norm = (test_df - test_df_mean)/test_df_std
In classification problems, the label for every example must be either 0 or 1. Unfortunately, the natural label in the California Housing Dataset, median_house_value, contains floating-point values like 80,100 or 85,700 rather than 0s and 1s, while the normalized version of median_house_value contains floating-point values primarily between -3 and +3.
Your task is to create a new column named median_house_value_is_high in both the training set and the test set. If the median_house_value is higher than a certain arbitrary value (defined by threshold), then set median_house_value_is_high to 1. Otherwise, set median_house_value_is_high to 0.
Hint: The cells in the median_house_value_is_high column must each hold 1 and 0, not True and False. To convert True and False to 1 and 0, call the pandas DataFrame function astype(float).
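To illustrate the hint, here is a minimal sketch on a toy pandas Series (the values are made up purely for illustration): comparing a Series against a number produces True/False values, and astype(float) converts those booleans to 1.0 and 0.0.
In [0]:
# Toy illustration of the hint above; the Series values are arbitrary.
toy = pd.Series([1, 5, 3])
print(toy > 2)                  # False, True, True
print((toy > 2).astype(float))  # 0.0, 1.0, 1.0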
In [0]:
threshold = 265000 # This is the 75th percentile for median house values.
train_df_norm["median_house_value_is_high"] = ? Your code here
test_df_norm["median_house_value_is_high"] = ? Your code here
# Print out a few example cells from the beginning and
# middle of the training set, just to make sure that
# your code created only 0s and 1s in the newly created
# median_house_value_is_high column
train_df_norm["median_house_value_is_high"].head(8000)
In [0]:
#@title Double-click for possible solutions.
# We arbitrarily set the threshold to 265,000, which is
# the 75th percentile for median house values. Every neighborhood
# with a median house price above 265,000 will be labeled 1,
# and all other neighborhoods will be labeled 0.
threshold = 265000
train_df_norm["median_house_value_is_high"] = (train_df["median_house_value"] > threshold).astype(float)
test_df_norm["median_house_value_is_high"] = (test_df["median_house_value"] > threshold).astype(float)
train_df_norm["median_house_value_is_high"].head(8000)
# Alternatively, instead of picking the threshold
# based on raw house values, you can work with Z-scores.
# For example, the following possible solution uses a Z-score
# of +1.0 as the threshold, meaning that roughly 16% of the
# values in median_house_value_is_high will be labeled 1
# (assuming the values are approximately normally distributed).
# threshold_in_Z = 1.0
# train_df_norm["median_house_value_is_high"] = (train_df_norm["median_house_value"] > threshold_in_Z).astype(float)
# test_df_norm["median_house_value_is_high"] = (test_df_norm["median_house_value"] > threshold_in_Z).astype(float)
This code cell specifies the features that you'll ultimately train the model on and how each of those features will be represented. The transformations (collected in feature_layer) don't actually get applied until you pass a DataFrame to it, which will happen when we train the model.
In [0]:
# Create an empty list that will eventually hold all created feature columns.
feature_columns = []
# Create a numerical feature column to represent median_income.
median_income = tf.feature_column.numeric_column("median_income")
feature_columns.append(median_income)
# Create a numerical feature column to represent total_rooms.
tr = tf.feature_column.numeric_column("total_rooms")
feature_columns.append(tr)
# Convert the list of feature columns into a layer that will later be fed into
# the model.
feature_layer = layers.DenseFeatures(feature_columns)
# Print the first 3 and last 3 rows of the feature_layer's output when applied
# to train_df_norm:
feature_layer(dict(train_df_norm))
The following code cell defines two functions:
create_model(my_learning_rate, feature_layer, my_metrics), which defines the model's topography.
train_model(model, dataset, epochs, label_name, batch_size, shuffle), which uses input features and labels to train the model.
Prior exercises used ReLU as the activation function. By contrast, this exercise uses sigmoid as the activation function.
In [0]:
#@title Define the functions that create and train a model.
def create_model(my_learning_rate, feature_layer, my_metrics):
  """Create and compile a simple classification model."""
  # Most simple tf.keras models are sequential.
  model = tf.keras.models.Sequential()

  # Add the feature layer (the list of features and how they are represented)
  # to the model.
  model.add(feature_layer)

  # Funnel the regression value through a sigmoid function.
  model.add(tf.keras.layers.Dense(units=1, input_shape=(1,),
                                  activation=tf.sigmoid))

  # Call the compile method to construct the layers into a model that
  # TensorFlow can execute. Notice that we're using a different loss
  # function for classification than for regression.
  model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=my_learning_rate),
                loss=tf.keras.losses.BinaryCrossentropy(),
                metrics=my_metrics)

  return model
def train_model(model, dataset, epochs, label_name,
                batch_size=None, shuffle=True):
  """Feed a dataset into the model in order to train it."""
  # The x parameter of tf.keras.Model.fit can be a dictionary of arrays, where
  # each array contains the data for one feature. Here, we're passing
  # every column in the dataset. Note that the feature_layer will filter
  # away most of those columns, leaving only the desired columns and their
  # representations as features.
  features = {name: np.array(value) for name, value in dataset.items()}
  label = np.array(features.pop(label_name))
  history = model.fit(x=features, y=label, batch_size=batch_size,
                      epochs=epochs, shuffle=shuffle)

  # The list of epochs is stored separately from the rest of history.
  epochs = history.epoch

  # Isolate the classification metric for each epoch.
  hist = pd.DataFrame(history.history)

  return epochs, hist
print("Defined the create_model and train_model functions.")
The following matplotlib function plots one or more curves, showing how various classification metrics change with each epoch.
In [0]:
#@title Define the plotting function.
def plot_curve(epochs, hist, list_of_metrics):
  """Plot a curve of one or more classification metrics vs. epoch."""
  # list_of_metrics should contain one or more of the metric names shown in:
  # https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#define_the_model_and_metrics
  plt.figure()
  plt.xlabel("Epoch")
  plt.ylabel("Value")

  for m in list_of_metrics:
    x = hist[m]
    plt.plot(epochs[1:], x[1:], label=m)

  plt.legend()
print("Defined the plot_curve function.")
In [0]:
# The following variables are the hyperparameters.
learning_rate = 0.001
epochs = 20
batch_size = 100
label_name = "median_house_value_is_high"
classification_threshold = 0.35
# Establish the metrics the model will measure.
METRICS = [
tf.keras.metrics.BinaryAccuracy(name='accuracy',
threshold=classification_threshold),
]
# Establish the model's topography.
my_model = create_model(learning_rate, feature_layer, METRICS)
# Train the model on the training set.
epochs, hist = train_model(my_model, train_df_norm, epochs,
label_name, batch_size)
# Plot a graph of the metric(s) vs. epochs.
list_of_metrics_to_plot = ['accuracy']
plot_curve(epochs, hist, list_of_metrics_to_plot)
Accuracy should gradually improve during training (until it can improve no more). The following code cell evaluates the trained model against the test set.
In [0]:
features = {name:np.array(value) for name, value in test_df_norm.items()}
label = np.array(features.pop(label_name))
my_model.evaluate(x = features, y = label, batch_size=batch_size)
In [0]:
#@title Double-click for a possible answer to Task 2.
# A perfect model would make 100% accurate predictions.
# Our model makes 80% accurate predictions. 80% sounds
# good, but note that a model that always guesses
# "median_house_value_is_high is False" would be 75%
# accurate.
Relying solely on accuracy, particularly for a class-imbalanced data set (like ours), can be a poor way to judge a classification model. Modify the code in the following code cell to enable the model to measure not only accuracy but also precision and recall. We have added accuracy and precision; your task is to add recall. See the TensorFlow Reference for details.
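As a quick refresher before you modify the metrics: precision = TP / (TP + FP) and recall = TP / (TP + FN). The sketch below uses made-up confusion-matrix counts purely for illustration.
In [0]:
# Made-up confusion-matrix counts, purely for illustration.
tp, fp, fn = 30, 10, 20
print("precision:", tp / (tp + fp))  # 0.75
print("recall:", tp / (tp + fn))     # 0.6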
In [0]:
# The following variables are the hyperparameters.
learning_rate = 0.001
epochs = 20
batch_size = 100
classification_threshold = 0.35
label_name = "median_house_value_is_high"
# Modify the following definition of METRICS to generate
# not only accuracy and precision, but also recall:
METRICS = [
tf.keras.metrics.BinaryAccuracy(name='accuracy',
threshold=classification_threshold),
tf.keras.metrics.Precision(thresholds=classification_threshold,
name='precision'
),
? # write code here
]
# Establish the model's topography.
my_model = create_model(learning_rate, feature_layer, METRICS)
# Train the model on the training set.
epochs, hist = train_model(my_model, train_df_norm, epochs,
label_name, batch_size)
# Plot metrics vs. epochs
list_of_metrics_to_plot = ['accuracy', 'precision', 'recall']
plot_curve(epochs, hist, list_of_metrics_to_plot)
In [0]:
#@title Double-click to view the solution for Task 3.
# The following variables are the hyperparameters.
learning_rate = 0.001
epochs = 20
batch_size = 100
classification_threshold = 0.35
label_name = "median_house_value_is_high"
# Here is the updated definition of METRICS:
METRICS = [
tf.keras.metrics.BinaryAccuracy(name='accuracy',
threshold=classification_threshold),
tf.keras.metrics.Precision(thresholds=classification_threshold,
name='precision'
),
tf.keras.metrics.Recall(thresholds=classification_threshold,
name="recall"),
]
# Establish the model's topography.
my_model = create_model(learning_rate, feature_layer, METRICS)
# Train the model on the training set.
epochs, hist = train_model(my_model, train_df_norm, epochs,
label_name, batch_size)
# Plot metrics vs. epochs
list_of_metrics_to_plot = ['accuracy', "precision", "recall"]
plot_curve(epochs, hist, list_of_metrics_to_plot)
# The new graphs suggest that precision and recall are
# somewhat in conflict. That is, improvements to one of
# those metrics may hurt the other metric.
In [0]:
#@title Double-click to view the solution for Task 4.
# The following variables are the hyperparameters.
learning_rate = 0.001
epochs = 20
batch_size = 100
classification_threshold = 0.52
label_name = "median_house_value_is_high"
# Here is the updated definition of METRICS:
METRICS = [
tf.keras.metrics.BinaryAccuracy(name='accuracy',
threshold=classification_threshold),
tf.keras.metrics.Precision(thresholds=classification_threshold,
name='precision'
),
tf.keras.metrics.Recall(thresholds=classification_threshold,
name="recall"),
]
# Establish the model's topography.
my_model = create_model(learning_rate, feature_layer, METRICS)
# Train the model on the training set.
epochs, hist = train_model(my_model, train_df_norm, epochs,
label_name, batch_size)
# Plot metrics vs. epochs
list_of_metrics_to_plot = ['accuracy', "precision", "recall"]
plot_curve(epochs, hist, list_of_metrics_to_plot)
# A `classification_threshold` of slightly over 0.5
# appears to produce the highest accuracy (about 83%).
# Raising the `classification_threshold` to 0.9 drops
# accuracy by about 5%. Lowering the
# `classification_threshold` to 0.3 drops accuracy by
# about 3%.
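If you want to see the thresholding step explicitly, here is a rough sketch (it assumes my_model has been trained as above and reuses the features dictionary built earlier for the test-set evaluation): the model emits sigmoid probabilities in (0, 1), and comparing them to classification_threshold yields the 0/1 predictions.
In [0]:
# Sketch only: assumes `my_model` was trained as above and `features` is the
# test-set feature dictionary created earlier for model.evaluate.
probabilities = my_model.predict(x=features, batch_size=batch_size)
predictions = (probabilities > classification_threshold).astype(float)
print(predictions[:5])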
In [0]:
#@title Double-click to view the solution for Task 5.
# The following variables are the hyperparameters.
learning_rate = 0.001
epochs = 20
batch_size = 100
label_name = "median_house_value_is_high"
# AUC is a reasonable "summary" metric for
# classification models.
# Here is the updated definition of METRICS to
# measure AUC:
METRICS = [
tf.keras.metrics.AUC(num_thresholds=100, name='auc'),
]
# Establish the model's topography.
my_model = create_model(learning_rate, feature_layer, METRICS)
# Train the model on the training set.
epochs, hist = train_model(my_model, train_df_norm, epochs,
label_name, batch_size)
# Plot metrics vs. epochs
list_of_metrics_to_plot = ['auc']
plot_curve(epochs, hist, list_of_metrics_to_plot)