Synthetic Features and Outliers

Learning Objectives:

  • Create a synthetic feature that is the ratio of two other features
  • Use this new feature as an input to a linear regression model
  • Improve the effectiveness of the model by identifying and clipping (capping the values of) outliers in the input data

Setup

Install the TensorFlow 2.0 beta release (pinned to 2.0.0-beta1)


In [ ]:
!pip install tensorflow==2.0.0-beta1

In [ ]:
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import logging
from packaging import version
from IPython.display import display

In [ ]:
# Limit DataFrame output to 10 rows.
pd.options.display.max_rows = 10
# Format floats with one decimal place when DataFrames are displayed.
pd.options.display.float_format = '{:.1f}'.format
# Disable the 'tensorflow' logger so its messages do not clutter cell output.
logging.getLogger('tensorflow').disabled = True

In [ ]:
import tensorflow as tf

%load_ext tensorboard

In [ ]:
from datetime import datetime
import io

# Disable the 'tensorboard' logger as well.
logging.getLogger('tensorboard').disabled = True

# Download the California housing training set (comma-separated CSV).
california_housing_dataframe = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv", sep=",")

# Shuffle the rows by reindexing with a random permutation of the index.
california_housing_dataframe = california_housing_dataframe.reindex(
    np.random.permutation(california_housing_dataframe.index))
# Divide the label column by 1000 so it is expressed in thousands of its
# original unit.
california_housing_dataframe["median_house_value"] /= 1000.0
# Bare expression: as the last statement of the cell, the notebook renders the
# DataFrame as the cell output.
california_housing_dataframe

We'll set up our plot_to_image function to convert the matplotlib plot specified by figure to a PNG image


In [ ]:
def plot_to_image(figure):
  """Converts the matplotlib plot specified by 'figure' to a PNG image and
  returns it. The supplied figure is closed and inaccessible after this call.

  Args:
    figure: The matplotlib figure to render.
  Returns:
    The decoded PNG as a TF image tensor with 4 channels and a leading batch
    dimension of size 1, suitable for `tf.summary.image`.
  """
  # Save the plot to a PNG in memory. Render `figure` itself rather than
  # pyplot's implicit "current figure", which is not guaranteed to be the
  # figure the caller passed in.
  buf = io.BytesIO()
  figure.savefig(buf, format='png')
  # Closing the figure prevents it from being displayed directly inside
  # the notebook.
  plt.close(figure)
  # Convert PNG buffer to TF image. getvalue() returns the whole buffer
  # regardless of the stream position, so no seek is needed.
  image = tf.image.decode_png(buf.getvalue(), channels=4)
  # Add the batch dimension
  return tf.expand_dims(image, 0)

Next, we'll define the function for model training


In [ ]:
def fit_model(learning_rate,
              steps_per_epoch,
              batch_size,
              input_feature):
  """Trains a linear regression model of one feature.

  The model is a single `Dense(1)` layer trained with SGD for a fixed number
  of epochs. After every epoch the current RMSE is printed and a plot of the
  learned line over a 300-row sample is written as a TensorBoard image
  summary; scalar metrics are logged by the Keras TensorBoard callback.

  Args:
    learning_rate: A `float`, the learning rate.
    steps_per_epoch: A non-zero `int`, the number of training steps in each
      epoch. A training step consists of a forward and backward pass using a
      single batch.
    batch_size: A non-zero `int`, the batch size.
    input_feature: A `string` specifying a column from `california_housing_dataframe`
      to use as input feature.
  Returns:
    A Pandas `DataFrame` containing targets and the corresponding predictions done
    after training the model.
  """

  epochs = 10
  label = "median_house_value"
  features = california_housing_dataframe[[input_feature]].values
  labels = california_housing_dataframe[label].values

  model = tf.keras.models.Sequential([
      tf.keras.layers.Dense(1, activation='linear', kernel_initializer='zeros')
  ])
  model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate, clipnorm=5.0),
                loss='mse',
                metrics=[tf.keras.metrics.RootMeanSquaredError()])

  sample = california_housing_dataframe.sample(n=300)
  # Use one timestamp so the plot and scalar log directories of a run always
  # carry the same suffix.
  timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
  logdir = "logs/synthetic_features_and_outliers/plots" + timestamp
  scalars_logdir = "logs/synthetic_features_and_outliers/scalars" + timestamp
  file_writer = tf.summary.create_file_writer(logdir)

  # Set up to plot the state of our model's line each epoch.
  # Colormaps take floats in [0, 1]; sampling that interval gives every epoch
  # a distinct colour.
  colors = [cm.coolwarm(x) for x in np.linspace(0, 1, epochs)]
  x_min_max = (sample[input_feature].min(), sample[input_feature].max())
  y_min_max = (0, sample[label].max())

  def create_figure():
    """Returns a new figure with the sampled feature/label scatter plot."""
    figure = plt.figure(figsize=(15, 6))
    plt.title("Learned Line by Epoch")
    plt.ylabel(label)
    plt.xlabel(input_feature)
    plt.scatter(sample[input_feature], sample[label])
    return figure

  def log(epoch, logs):
    """Prints the epoch's RMSE and logs a plot of the current learned line."""
    root_mean_squared_error = logs["root_mean_squared_error"]
    print("  epoch %02d : %0.2f" % (epoch, root_mean_squared_error))

    weight, bias = [x.flatten()[0] for x in model.layers[0].get_weights()]

    # Apply some math to ensure that the data and line are plotted neatly.
    if weight == 0:
      # A flat line has no x-intercepts with the y range; dividing by the
      # zero weight would give inf/NaN, so span the sampled x range directly.
      x_extents = np.array(x_min_max)
    else:
      # Find where the line crosses the y range, then clamp to the x range.
      y_extents = np.array(y_min_max)
      x_extents = (y_extents - bias) / weight
      x_extents = np.clip(x_extents, x_min_max[0], x_min_max[1])
    y_extents = weight * x_extents + bias
    figure = create_figure()
    plt.plot(x_extents, y_extents, color=colors[epoch])
    with file_writer.as_default():
      tf.summary.image("Learned Line by Epoch",
                       plot_to_image(figure),
                       step=epoch)

  model_callback = tf.keras.callbacks.LambdaCallback(on_epoch_end=log)
  tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=scalars_logdir,
                                                       update_freq='epoch')

  print("Train model...")
  print("RMSE (on training data):")
  history = model.fit(features,
                      labels,
                      epochs=epochs,
                      steps_per_epoch=steps_per_epoch,
                      batch_size=batch_size,
                      callbacks=[model_callback, tensorboard_callback],
                      verbose=0).history
  print("Model training finished.")

  calibration_data = pd.DataFrame()
  calibration_data["predictions"] = model.predict_on_batch(features).flatten()
  calibration_data["targets"] = pd.Series(labels)
  display(calibration_data.describe())
  # Take the last recorded epoch rather than a hard-coded index, so the
  # report stays correct if `epochs` is changed.
  root_mean_squared_error = history["root_mean_squared_error"][-1]
  print("Final RMSE (on training data): %0.2f" % root_mean_squared_error)

  return calibration_data

Task 1: Try a Synthetic Feature

Both the total_rooms and population features count totals for a given city block.

But what if one city block were more densely populated than another? We can explore how block density relates to median house value by creating a synthetic feature that's a ratio of total_rooms and population.

In the cell below, create a feature called rooms_per_person, and use that as the input_feature to fit_model().

What's the best performance you can get with this single feature by tweaking the learning rate? (The better the performance, the better your regression line should fit the data, and the lower the final RMSE should be.)


In [ ]:
!rm -rf logs/synthetic_features_and_outliers

In [ ]:
# Synthetic feature: element-wise ratio of total_rooms to population.
california_housing_dataframe["rooms_per_person"] = (
    california_housing_dataframe["total_rooms"] / california_housing_dataframe["population"])

# Train on the new feature alone; the returned DataFrame holds the
# predictions and targets used by the later cells.
calibration_data = fit_model(
    learning_rate=0.00005,
    steps_per_epoch=500,
    batch_size=5,
    input_feature="rooms_per_person"
)

In [ ]:
from google.datalab.ml import TensorBoard
TensorBoard().start('logs/synthetic_features_and_outliers')

Task 2: Identify Outliers

We can visualize the performance of our model by creating a scatter plot of predictions vs. target values. Ideally, these would lie on a perfectly correlated diagonal line.

Use Pyplot's scatter() to create a scatter plot of predictions vs. targets, using the rooms-per-person model you trained in Task 1.

Do you see any oddities? Trace these back to the source data by looking at the distribution of values in rooms_per_person.


In [ ]:
# Base path for image summaries; a timestamp suffix gives each writer its own
# directory.
logdir = "logs/synthetic_features_and_outliers/plots"
file_writer = tf.summary.create_file_writer(logdir + datetime.now().strftime("%Y%m%d-%H%M%S"))

figure = plt.figure(figsize=(15, 6))
# Draw in the first slot of a 1x2 subplot grid.
plt.subplot(1, 2, 1)
# Predictions on the x-axis, targets on the y-axis.
plt.scatter(calibration_data["predictions"], calibration_data["targets"])
# Log the figure as a TensorBoard image summary instead of showing it inline.
with file_writer.as_default():
  tf.summary.image("Predictions vs Targets",
                   plot_to_image(figure),
                   step=0)

The calibration data shows most scatter points aligned to a line. The line is almost vertical, but we'll come back to that later. Right now let's focus on the ones that deviate from the line. We notice that they are relatively few in number.

If we plot a histogram of rooms_per_person, we find that we have a few outliers in our input data:


In [ ]:
figure = plt.figure()
# Draw in the second slot of a 1x2 subplot grid.
plt.subplot(1, 2, 2)
# Histogram of the synthetic feature; the result is assigned to `_` so the
# notebook does not echo it.
_ = california_housing_dataframe["rooms_per_person"].hist()
# Log the histogram to TensorBoard using the writer created in the previous
# plotting cell.
with file_writer.as_default():
  tf.summary.image("Rooms per person",
                   plot_to_image(figure),
                   step=0)

In [ ]:
TensorBoard().start('logs/synthetic_features_and_outliers')

Task 3: Clip Outliers

See if you can further improve the model fit by setting the outlier values of rooms_per_person to some reasonable minimum or maximum.

For reference, here's a quick example of how to apply a function to a Pandas Series:

clipped_feature = my_dataframe["my_feature_name"].apply(lambda x: max(x, 0))

The above clipped_feature will have no values less than 0.

The histogram we created in Task 2 shows that the majority of values are less than 5. Let's clip rooms_per_person to 5, and plot a histogram to double-check the results.


In [ ]:
# Cap rooms_per_person at 5: values above 5 are replaced by 5. This
# overwrites the column in place, so re-running the ratio cell is needed to
# get the unclipped values back.
california_housing_dataframe["rooms_per_person"] = (
    california_housing_dataframe["rooms_per_person"]).apply(lambda x: min(x, 5))
figure = plt.figure()
# Histogram of the clipped feature to double-check the result.
_ = california_housing_dataframe["rooms_per_person"].hist()
with file_writer.as_default():
  tf.summary.image("Clipped Rooms per person",
                   plot_to_image(figure),
                   step=0)

In [ ]:
TensorBoard().start('logs/synthetic_features_and_outliers')

To verify that clipping worked, let's train again and print the calibration data once more:


In [ ]:
# Retrain on the (now clipped) rooms_per_person column. Note the larger
# learning rate and step count compared with the Task 1 run.
calibration_data = fit_model(
    learning_rate=0.05,
    steps_per_epoch=1000,
    batch_size=5,
    input_feature="rooms_per_person")

In [ ]:
# New timestamped writer; `logdir` is the module-level base path defined in
# the Task 2 plotting cell, so that cell must have been run first.
file_writer = tf.summary.create_file_writer(logdir + datetime.now().strftime("%Y%m%d-%H%M%S"))
figure = plt.figure()
# Predictions (x) vs. targets (y) for the model trained on the clipped feature.
_ = plt.scatter(calibration_data["predictions"], calibration_data["targets"])
with file_writer.as_default():
  tf.summary.image("Predictions vs Targets",
                   plot_to_image(figure),
                   step=0)

In [ ]:
TensorBoard().start('logs/synthetic_features_and_outliers')