Learning Objectives:
Install latest 2.x.x
release for tensorflow
In [ ]:
# Install the exact TensorFlow build pinned for this notebook (2.0.0-beta1).
!pip install tensorflow==2.0.0-beta1
In [ ]:
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import logging
from packaging import version
from IPython.display import display
In [ ]:
# Keep DataFrame output compact: show at most 10 rows and format floats
# with a single decimal place.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", "{:.1f}".format)
# Disable the 'tensorflow' logger so its messages do not clutter cell output.
logging.getLogger("tensorflow").disabled = True
In [ ]:
import tensorflow as tf
# Load the IPython extension named `tensorboard`.
%load_ext tensorboard
In [ ]:
from datetime import datetime
import io
# Disable the 'tensorboard' logger to keep cell output quiet.
logging.getLogger('tensorboard').disabled = True

# Read the California housing training set straight from its hosted CSV.
california_housing_dataframe = pd.read_csv(
    "https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv",
    sep=",",
)

# Put the rows in a random order.
california_housing_dataframe = california_housing_dataframe.reindex(
    np.random.permutation(california_housing_dataframe.index)
)

# Divide the label column by 1000 so it is expressed in thousands.
california_housing_dataframe["median_house_value"] = (
    california_housing_dataframe["median_house_value"] / 1000.0
)

# Trailing expression: the notebook renders the DataFrame as the cell output.
california_housing_dataframe
We'll set up our plot_to_image
function to convert the matplotlib plot specified by figure to a PNG image
In [ ]:
def plot_to_image(figure):
    """Converts the matplotlib plot specified by 'figure' to a PNG image and
    returns it. The supplied figure is closed and inaccessible after this call.

    Args:
      figure: The matplotlib figure to render.

    Returns:
      The PNG decoded with 4 channels by `tf.image.decode_png`, with an extra
      leading (batch) dimension added by `tf.expand_dims(image, 0)`.
    """
    # Save the plot to a PNG in memory. Render the figure that was passed in,
    # not whichever figure pyplot currently treats as active; the two can
    # differ if another figure was created in between.
    buf = io.BytesIO()
    figure.savefig(buf, format='png')
    # Closing the figure prevents it from being displayed directly inside
    # the notebook.
    plt.close(figure)
    # Convert PNG buffer to TF image. `getvalue()` returns the whole buffer
    # regardless of the stream position, so no rewind is needed.
    image = tf.image.decode_png(buf.getvalue(), channels=4)
    # Add the batch dimension
    image = tf.expand_dims(image, 0)
    return image
Next, we'll define the function for model training
In [ ]:
def fit_model(learning_rate,
              steps_per_epoch,
              batch_size,
              input_feature):
    """Trains a linear regression model of one feature.

    Reads the module-level `california_housing_dataframe`, trains for a fixed
    number of epochs, prints the RMSE after every epoch and writes one
    "Learned Line by Epoch" image summary per epoch, plus the Keras
    TensorBoard callback logs, under `logs/synthetic_features_and_outliers/`.

    Args:
      learning_rate: A `float`, the learning rate.
      steps_per_epoch: A non-zero `int`, the number of training steps per
        epoch. A training step consists of a forward and backward pass using
        a single batch.
      batch_size: A non-zero `int`, the batch size.
      input_feature: A `string` specifying a column from
        `california_housing_dataframe` to use as input feature.

    Returns:
      A Pandas `DataFrame` containing targets and the corresponding
      predictions done after training the model.
    """
    epochs = 10
    features = california_housing_dataframe[[input_feature]].values
    label = "median_house_value"
    labels = california_housing_dataframe[label].values

    # A single dense unit with linear activation is y = weight * x + bias.
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(1, activation='linear', kernel_initializer='zeros')
    ])
    model.compile(
        optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate, clipnorm=5.0),
        loss='mse',
        metrics=[tf.keras.metrics.RootMeanSquaredError()])

    # A small random sample keeps the per-epoch scatter plot readable.
    sample = california_housing_dataframe.sample(n=300)

    # Take the timestamp once so both log directories share the same suffix.
    run_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    logdir = "logs/synthetic_features_and_outliers/plots" + run_stamp
    scalars_logdir = "logs/synthetic_features_and_outliers/scalars" + run_stamp
    file_writer = tf.summary.create_file_writer(logdir)

    # Set up to plot the state of our model's line each epoch.
    def create_plt_params(feature, label, epochs=10):
        # Colormaps map floats in [0, 1]; values below 0 are clipped to the
        # first colour, so sample the full [0, 1] range to get one distinct
        # colour per epoch.
        colors = [cm.coolwarm(x) for x in np.linspace(0, 1, epochs)]
        return (colors,
                (sample[feature].min(), sample[feature].max()),
                (0, sample[label].max()))

    def create_figure(feature, label):
        figure = plt.figure(figsize=(15, 6))
        plt.title("Learned Line by Epoch")
        plt.ylabel(label)
        plt.xlabel(feature)
        plt.scatter(sample[feature], sample[label])
        return figure

    colors, x_min_max, y_min_max = create_plt_params(input_feature, label, epochs)

    def log(epoch, logs):
        root_mean_squared_error = logs["root_mean_squared_error"]
        print(" epoch %02d : %0.2f" % (epoch, root_mean_squared_error))

        weight, bias = [x.flatten()[0] for x in model.layers[0].get_weights()]

        # Apply some math to ensure that the data and line are plotted neatly.
        y_extents = np.array(y_min_max)
        if weight == 0:
            # A flat line has no x-intercepts with the y extents; span the
            # sampled x range instead of dividing by zero.
            x_extents = np.array(x_min_max, dtype=float)
        else:
            x_extents = (y_extents - bias) / weight
            x_extents = np.maximum(np.minimum(x_extents,
                                              x_min_max[1]),
                                   x_min_max[0])
        y_extents = weight * x_extents + bias

        figure = create_figure(input_feature, label)
        plt.plot(x_extents, y_extents, color=colors[epoch])
        with file_writer.as_default():
            tf.summary.image("Learned Line by Epoch",
                             plot_to_image(figure),
                             step=epoch)

    model_callback = tf.keras.callbacks.LambdaCallback(
        on_epoch_end=lambda epoch, logs: log(epoch, logs))
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=scalars_logdir,
                                                          update_freq='epoch')

    print("Train model...")
    print("RMSE (on training data):")
    history = model.fit(features,
                        labels,
                        epochs=epochs,
                        steps_per_epoch=steps_per_epoch,
                        batch_size=batch_size,
                        callbacks=[model_callback, tensorboard_callback],
                        verbose=0).history
    print("Model training finished.")

    calibration_data = pd.DataFrame()
    calibration_data["predictions"] = model.predict_on_batch(features).flatten()
    calibration_data["targets"] = pd.Series(labels)
    display(calibration_data.describe())

    # Use the last recorded epoch rather than a hard-coded index, so this
    # stays correct if `epochs` changes.
    root_mean_squared_error = history["root_mean_squared_error"][-1]
    print("Final RMSE (on training data): %0.2f" % root_mean_squared_error)

    return calibration_data
Both the total_rooms
and population
features count totals for a given city block.
But what if one city block were more densely populated than another? We can explore how block density relates to median house value by creating a synthetic feature that's a ratio of total_rooms
and population
.
In the cell below, create a feature called rooms_per_person
, and use that as the input_feature
to fit_model()
.
What's the best performance you can get with this single feature by tweaking the learning rate? (The better the performance, the better your regression line should fit the data, and the lower the final RMSE should be.)
In [ ]:
# Delete this exercise's log directory (and everything in it) before training.
!rm -rf logs/synthetic_features_and_outliers
In [ ]:
# Synthetic feature: ratio of total rooms to population for each row.
california_housing_dataframe["rooms_per_person"] = (
    california_housing_dataframe["total_rooms"].div(
        california_housing_dataframe["population"]
    )
)

# Train on the new feature alone; the returned frame holds predictions and
# targets for the calibration plots that follow.
calibration_data = fit_model(
    input_feature="rooms_per_person",
    learning_rate=0.00005,
    steps_per_epoch=500,
    batch_size=5,
)
In [ ]:
# Start TensorBoard through the Datalab helper, pointed at this exercise's
# log directory.
# NOTE(review): `%load_ext tensorboard` is loaded earlier in the notebook, but
# the Datalab helper is what gets used here — confirm which launcher is intended.
from google.datalab.ml import TensorBoard
TensorBoard().start('logs/synthetic_features_and_outliers')
We can visualize the performance of our model by creating a scatter plot of predictions vs. target values. Ideally, these would lie on a perfectly correlated diagonal line.
Use Pyplot's scatter()
to create a scatter plot of predictions vs. targets, using the rooms-per-person model you trained in Task 1.
Do you see any oddities? Trace these back to the source data by looking at the distribution of values in rooms_per_person
.
In [ ]:
# Base directory for image summaries; a timestamp suffix gives each writer
# its own run directory.
logdir = "logs/synthetic_features_and_outliers/plots"
file_writer = tf.summary.create_file_writer(
    logdir + datetime.now().strftime("%Y%m%d-%H%M%S")
)

# Scatter predictions against targets in the left-hand panel of a 1x2 grid.
figure = plt.figure(figsize=(15, 6))
plt.subplot(1, 2, 1)
plt.scatter(calibration_data["predictions"], calibration_data["targets"])

# Write the rendered figure as an image summary.
with file_writer.as_default():
    tf.summary.image(
        name="Predictions vs Targets",
        data=plot_to_image(figure),
        step=0,
    )
The calibration data shows most scatter points aligned to a line. The line is almost vertical, but we'll come back to that later. Right now let's focus on the ones that deviate from the line. We notice that they are relatively few in number.
If we plot a histogram of rooms_per_person
, we find that we have a few outliers in our input data:
In [ ]:
# Histogram of the synthetic feature, drawn in the right-hand panel of a
# 1x2 grid.
figure = plt.figure()
plt.subplot(1, 2, 2)
_ = california_housing_dataframe["rooms_per_person"].hist()

# Write the rendered figure as an image summary.
with file_writer.as_default():
    tf.summary.image(
        name="Rooms per person",
        data=plot_to_image(figure),
        step=0,
    )
In [ ]:
# Start TensorBoard on this exercise's log directory to view the new images.
TensorBoard().start('logs/synthetic_features_and_outliers')
See if you can further improve the model fit by setting the outlier values of rooms_per_person
to some reasonable minimum or maximum.
For reference, here's a quick example of how to apply a function to a Pandas Series
:
clipped_feature = my_dataframe["my_feature_name"].apply(lambda x: max(x, 0))
The above clipped_feature
will have no values less than 0
.
The histogram we created in Task 2 shows that the majority of values are less than 5
. Let's clip rooms_per_person
to 5, and plot a histogram to double-check the results.
In [ ]:
# Cap the feature at 5: every value above 5 becomes 5, the rest are unchanged.
california_housing_dataframe["rooms_per_person"] = (
    california_housing_dataframe["rooms_per_person"].clip(upper=5)
)

# Re-plot the histogram to confirm the upper bound took effect.
figure = plt.figure()
_ = california_housing_dataframe["rooms_per_person"].hist()

# Write the rendered figure as an image summary.
with file_writer.as_default():
    tf.summary.image(
        name="Clipped Rooms per person",
        data=plot_to_image(figure),
        step=0,
    )
In [ ]:
# Start TensorBoard on this exercise's log directory to view the clipped histogram.
TensorBoard().start('logs/synthetic_features_and_outliers')
To verify that clipping worked, let's train again and print the calibration data once more:
In [ ]:
# Retrain on the (now clipped) feature with a larger learning rate and more
# steps per epoch than the first run.
calibration_data = fit_model(
    input_feature="rooms_per_person",
    learning_rate=0.05,
    steps_per_epoch=1000,
    batch_size=5,
)
In [ ]:
# Fresh writer with a new timestamp suffix; `logdir` is the plots base
# directory defined in an earlier cell.
file_writer = tf.summary.create_file_writer(
    logdir + datetime.now().strftime("%Y%m%d-%H%M%S")
)

# Scatter predictions against targets for the retrained model.
figure = plt.figure()
_ = plt.scatter(calibration_data["predictions"], calibration_data["targets"])

# Write the rendered figure as an image summary.
with file_writer.as_default():
    tf.summary.image(
        name="Predictions vs Targets",
        data=plot_to_image(figure),
        step=0,
    )
In [ ]:
# Start TensorBoard on this exercise's log directory to view the final scatter plot.
TensorBoard().start('logs/synthetic_features_and_outliers')