In [0]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Learning Objectives:
Please make a copy of this Colab notebook before starting this lab. To do so, choose File->Save a copy in Drive.
In [0]:
%reset -f
import numpy as np
import pandas as pd
import math
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
We use a package called Pandas for reading in, exploring, and doing some basic processing of our data. It is really helpful for datasets that fit in memory! And it has some nice integrations, as you will see.
First we set up some options to control how items are displayed and the maximum number of rows to show when displaying a table. Feel free to change this setup to whatever you'd like.
In [0]:
# Set pandas output display to show two digits after the decimal point and to
# limit tables to printing 15 rows.
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_rows = 15
The car data set we will be using in this lab is provided as a comma-separated file without a header row. In order for each column to have a meaningful header name, we must provide the names ourselves. We get the information about the columns from the Automobile Data Set.
We will use the features of the car to try to predict its price.
In [0]:
# Provide the names for the columns since the CSV file with the data does
# not have a header row.
feature_names = ['symboling', 'normalized-losses', 'make', 'fuel-type',
                 'aspiration', 'num-doors', 'body-style', 'drive-wheels',
                 'engine-location', 'wheel-base', 'length', 'width', 'height',
                 'weight', 'engine-type', 'num-cylinders', 'engine-size',
                 'fuel-system', 'bore', 'stroke', 'compression-ratio',
                 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']

# Load in the data from a CSV file that is comma separated.
car_data = pd.read_csv(
    'https://storage.googleapis.com/mledu-datasets/cars_data.csv',
    sep=',', names=feature_names, header=None, encoding='latin-1')
# We'll then randomize the data, just to be sure not to get any pathological
# ordering effects that might harm the performance of Stochastic Gradient
# Descent.
car_data = car_data.reindex(np.random.permutation(car_data.index))
print("Data set loaded. Num examples: ", len(car_data))
This is a really small dataset! Only 205 examples.
For simplicity in this codelab, we do not split the data further into training and validation. But you MUST do this on real datasets, or else you will overfit to your single dataset.
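For reference, since the data has already been shuffled above, here is one way such a split could look. This is only a sketch: the 160/45 split point is an arbitrary choice for illustration, and these variables are not used in the rest of the lab.
In [0]:
# A minimal sketch of a train/validation split on the shuffled data. The split
# point (160 examples) is an arbitrary choice; the rest of this lab keeps using
# the full car_data.
train_df = car_data[:160]
validation_df = car_data[160:]
print("Training examples: ", len(train_df))
print("Validation examples: ", len(validation_df))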
Useful functions:
- type() called on any Python object describes the type of the object.
- dataframe[4:7] pulls out rows 4, 5, and 6 of a Pandas dataframe.
- dataframe[['mycol1', 'mycol2']] pulls out the two requested columns into a new Pandas dataframe.
- dataframe['mycol1'] returns a Pandas series -- not a dataframe!
- dataframe.describe() prints out statistics for each dataframe column.
In [0]:
car_data[4:7]
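A few more of the functions listed above, applied to real columns from this dataset (just a quick sketch for exploration):
In [0]:
print(type(car_data))                    # A Pandas dataframe.
print(car_data[['make', 'price']][4:7])  # Two columns, rows 4, 5, and 6.
car_data.describe()                      # Statistics for the numeric columns.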
In [0]:
LABEL = 'price'
# Build the lists of numeric and categorical feature column names. The label
# ('price') is not a feature. This first attempt is incomplete: the asserts
# below will fail until all 15 numeric columns are listed.
numeric_feature_names = ['symboling', 'normalized-losses', 'wheel-base',
                         'engine-size', 'bore', 'stroke', 'compression-ratio',
                         'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg']
categorical_feature_names = list(set(feature_names) - set(numeric_feature_names) - set([LABEL]))
# The correct solution will pass these assert statements.
assert len(numeric_feature_names) == 15
assert len(categorical_feature_names) == 10
In [0]:
#@title Solution (to view code, from cell's menu, select Form -> Show Code)
numeric_feature_names = ['symboling', 'normalized-losses', 'wheel-base',
                         'length', 'width', 'height', 'weight', 'engine-size',
                         'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg',
                         'bore', 'stroke', 'compression-ratio']
categorical_feature_names = list(set(feature_names) - set(numeric_feature_names) - set([LABEL]))
assert len(numeric_feature_names) == 15
assert len(categorical_feature_names) == 10
In [0]:
# Run to inspect numeric features.
car_data[numeric_feature_names]
In [0]:
# Run to inspect categorical features.
car_data[categorical_feature_names]
In [0]:
# Coerce the numeric features (and the label) to numbers. This is necessary
# because not all of the raw values are numeric, and the model would crash on
# non-numeric input.
for feature_name in numeric_feature_names + [LABEL]:
  car_data[feature_name] = pd.to_numeric(car_data[feature_name], errors='coerce')

# Fill missing values with 0.
# Is this an OK thing to do? You may want to come back and revisit this decision later.
car_data.fillna(0, inplace=True)
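If you revisit the fill-with-0 decision, one alternative is filling each column with its mean. Here is a sketch of what that could look like; note it only has an effect if it runs in place of the fillna(0) call above, while missing values are still present.
In [0]:
# Sketch: mean-imputation as an alternative to filling with 0. Run this right
# after the pd.to_numeric coercion (instead of fillna(0)) for it to have any
# effect, since fillna(0) leaves no missing values behind.
for feature_name in numeric_feature_names + [LABEL]:
  car_data[feature_name] = car_data[feature_name].fillna(car_data[feature_name].mean())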
Modify the model provided below to achieve the lowest eval loss. You may want to change various hyperparameters, such as the learning rate, the optimizer, the hidden layer sizes, and the batch size.
For this exercise, do not use the normalizer_fn arg on numeric_column; that comes in a later exercise.
In [0]:
# This code "works", but because of bad hyperparameter choices it gets NaN loss
# during training. Try fixing this.
batch_size = 16
print(numeric_feature_names)
x_df = car_data[numeric_feature_names]
y_series = car_data['price']
# Create input_fn's so that the estimator knows how to read in your data.
train_input_fn = tf.estimator.inputs.pandas_input_fn(
x=x_df,
y=y_series,
batch_size=batch_size,
num_epochs=None,
shuffle=True)
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
x=x_df,
y=y_series,
batch_size=batch_size,
shuffle=False)
predict_input_fn = tf.estimator.inputs.pandas_input_fn(
x=x_df,
batch_size=batch_size,
shuffle=False)
# Feature columns allow the model to parse the data, perform common
# preprocessing, and automatically generate an input layer for the tf.Estimator.
model_feature_columns = [
tf.feature_column.numeric_column(feature_name) for feature_name in numeric_feature_names
]
print('model_feature_columns', model_feature_columns)
est = tf.estimator.DNNRegressor(
feature_columns=model_feature_columns,
hidden_units=[64],
optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.01),
)
# TRAIN
num_print_statements = 10
num_training_steps = 10000
for _ in range(num_print_statements):
  est.train(train_input_fn, steps=num_training_steps // num_print_statements)
  scores = est.evaluate(eval_input_fn)
  # The `scores` dictionary has several metrics automatically generated by the
  # canned Estimator.
  # `average_loss` is the average loss for an individual example.
  # `loss` is the summed loss for the batch.
  # In addition to these scalar losses, you may find the visualization functions
  # in the next cell helpful for debugging model quality.
  print('scores', scores)
In [0]:
#@title Possible solution
# Here is one possible solution:
# The only change needed to fix the NaN training loss was swapping the optimizer.
# Changing other hyperparameters may also improve model quality, but take any
# improvement with a grain of salt: the dataset is very small.
batch_size = 16
print(numeric_feature_names)
x_df = car_data[numeric_feature_names]
y_series = car_data['price']
train_input_fn = tf.estimator.inputs.pandas_input_fn(
x=x_df,
y=y_series,
batch_size=batch_size,
num_epochs=None,
shuffle=True)
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
x=x_df,
y=y_series,
batch_size=batch_size,
shuffle=False)
predict_input_fn = tf.estimator.inputs.pandas_input_fn(
x=x_df,
batch_size=batch_size,
shuffle=False)
# Feature columns allow the model to parse the data, perform common
# preprocessing, and automatically generate an input layer for the tf.Estimator.
model_feature_columns = [
tf.feature_column.numeric_column(feature_name) for feature_name in numeric_feature_names
]
print('model_feature_columns', model_feature_columns)
est = tf.estimator.DNNRegressor(
feature_columns=model_feature_columns,
hidden_units=[64],
optimizer=tf.train.AdagradOptimizer(learning_rate=0.01),
)
# TRAIN
num_print_statements = 10
num_training_steps = 10000
for _ in range(num_print_statements):
  est.train(train_input_fn, steps=num_training_steps // num_print_statements)
  scores = est.evaluate(eval_input_fn)
  # The `scores` dictionary has several metrics automatically generated by the
  # canned Estimator.
  # `average_loss` is the average loss for an individual example.
  # `loss` is the summed loss for the batch.
  # In addition to these scalar losses, you may find the visualization functions
  # in the next cell helpful for debugging model quality.
  print('scores', scores)
In [0]:
from matplotlib import pyplot as plt


def scatter_plot_inference_grid(est, x_df, feature_names):
  """Plots the predictions of the model against each feature.

  Args:
    est: The trained tf.Estimator.
    x_df: The pandas dataframe with the input data (used to create
      predict_input_fn).
    feature_names: An iterable of string feature names to plot.
  """
  def scatter_plot_inference(axis,
                             x_axis_feature_name,
                             y_axis_feature_name,
                             predictions):
    """Generates one subplot."""
    # Plot the real data in grey.
    axis.set_ylabel(y_axis_feature_name)
    axis.set_xlabel(x_axis_feature_name)
    axis.scatter(car_data[x_axis_feature_name],
                 car_data[y_axis_feature_name],
                 c='grey')
    # Plot the predicted data in orange.
    axis.scatter(car_data[x_axis_feature_name], predictions, c='orange')

  predict_input_fn = tf.estimator.inputs.pandas_input_fn(
      x=x_df,
      batch_size=batch_size,
      shuffle=False)
  predictions = [
      x['predictions'][0]
      for x in est.predict(predict_input_fn)
  ]

  num_cols = 3
  num_rows = int(math.ceil(len(feature_names) / float(num_cols)))
  f, axarr = plt.subplots(num_rows, num_cols)
  size = 4.5
  f.set_size_inches(num_cols * size, num_rows * size)

  for i, feature_name in enumerate(feature_names):
    axis = axarr[i // num_cols, i % num_cols]
    scatter_plot_inference(axis, feature_name, 'price', predictions)
  plt.show()


scatter_plot_inference_grid(est, x_df, numeric_feature_names)
You will need to use the normalizer_fn arg on numeric_column.
An example of a silly normalizer_fn that shifts inputs down by 1 and then negates the value:
normalizer_fn = lambda x: tf.negative(tf.subtract(x, 1))
You may find pandas functions such as DataFrame.mean() and DataFrame.std() helpful here.
Does normalization improve model quality on this dataset? Why or why not?
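As a sketch of the mechanics, here is what a linear (min-max) normalizer_fn could look like for a single column; 'horsepower' is just an example, and the statistics are captured as plain Python floats before the lambda is built.
In [0]:
# Sketch: a min-max normalizer_fn for one example column ('horsepower').
hp_min = float(car_data['horsepower'].min())
hp_max = float(car_data['horsepower'].max())
horsepower_column = tf.feature_column.numeric_column(
    'horsepower',
    normalizer_fn=lambda val: (val - hp_min) / (hp_max - hp_min))
print(horsepower_column)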
In [0]:
# This 1D visualization of each numeric feature might inform your normalization
# decisions.
for feature_name in numeric_feature_names:
  car_data.hist(column=feature_name)
In [0]:
## Your code goes here
In [0]:
#@title Possible solution
# This solution uses Z-score normalization, since the distributions of most
# features looked roughly normal.
# Z-score normalization subtracts the mean and divides by the standard
# deviation, giving each feature approximately zero mean and unit standard
# deviation. Epsilon prevents divide by zero.
# With normalization, are you able to get the model working with
# GradientDescentOptimizer? Z-score normalization doesn't seem to be enough to
# get SGD working here. Maybe a different type of normalization would?
batch_size = 16
print(numeric_feature_names)
x_df = car_data[numeric_feature_names]
y_series = car_data['price']
train_input_fn = tf.estimator.inputs.pandas_input_fn(
x=x_df,
y=y_series,
batch_size=batch_size,
num_epochs=None,
shuffle=True)
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
x=x_df,
y=y_series,
batch_size=batch_size,
shuffle=False)
predict_input_fn = tf.estimator.inputs.pandas_input_fn(
x=x_df,
batch_size=batch_size,
shuffle=False)
# Epsilon prevents divide by zero.
epsilon = 0.000001
# Precompute each column's mean and standard deviation, and bind them as
# lambda default arguments: a lambda that referred to feature_name directly
# would see only the loop variable's final value when it is eventually called.
means = x_df.mean()
stds = x_df.std()
model_feature_columns = [
    tf.feature_column.numeric_column(
        feature_name,
        normalizer_fn=lambda val, m=means[feature_name], s=stds[feature_name]:
            (val - m) / (epsilon + s))
    for feature_name in numeric_feature_names
]
print('model_feature_columns', model_feature_columns)
est = tf.estimator.DNNRegressor(
feature_columns=model_feature_columns,
hidden_units=[64],
optimizer=tf.train.AdagradOptimizer(learning_rate=0.01),
)
# TRAIN
num_print_statements = 10
num_training_steps = 10000
for _ in range(num_print_statements):
  est.train(train_input_fn, steps=num_training_steps // num_print_statements)
  scores = est.evaluate(eval_input_fn)
  # The `scores` dictionary has several metrics automatically generated by the
  # canned Estimator.
  # `average_loss` is the average loss for an individual example.
  # `loss` is the summed loss for the batch.
  # In addition to these scalar losses, you may find the visualization functions
  # in the next cell helpful for debugging model quality.
  print('scores', scores)
scatter_plot_inference_grid(est, x_df, numeric_feature_names)
In [0]:
## Your code goes here
In [0]:
#@title Possible solution
# We have the full list of values that each feature takes on, and the list is
# relatively small so we use categorical_column_with_vocabulary_list.
batch_size = 16
x_df = car_data[categorical_feature_names]
y_series = car_data['price']
train_input_fn = tf.estimator.inputs.pandas_input_fn(
x=x_df,
y=y_series,
batch_size=batch_size,
num_epochs=None,
shuffle=True)
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
x=x_df,
y=y_series,
batch_size=batch_size,
shuffle=False)
predict_input_fn = tf.estimator.inputs.pandas_input_fn(
x=x_df,
batch_size=batch_size,
shuffle=False)
model_feature_columns = [
tf.feature_column.indicator_column(
tf.feature_column.categorical_column_with_vocabulary_list(
feature_name, vocabulary_list=car_data[feature_name].unique()))
for feature_name in categorical_feature_names
]
print('model_feature_columns', model_feature_columns)
est = tf.estimator.DNNRegressor(
feature_columns=model_feature_columns,
hidden_units=[64],
optimizer=tf.train.AdagradOptimizer(learning_rate=0.01),
)
# TRAIN
num_print_statements = 10
num_training_steps = 10000
for _ in range(num_print_statements):
  est.train(train_input_fn, steps=num_training_steps // num_print_statements)
  scores = est.evaluate(eval_input_fn)
  # The `scores` dictionary has several metrics automatically generated by the
  # canned Estimator.
  # `average_loss` is the average loss for an individual example.
  # `loss` is the summed loss for the batch.
  # In addition to these scalar losses, you may find the visualization functions
  # in the next cell helpful for debugging model quality.
  print('scores', scores)
In [0]:
## Your code goes here
In [0]:
#@title Possible solution
# This is a first pass at a model that uses all the features.
# Do you have any improvements?
batch_size = 16
x_df = car_data[numeric_feature_names + categorical_feature_names]
y_series = car_data['price']
train_input_fn = tf.estimator.inputs.pandas_input_fn(
x=x_df,
y=y_series,
batch_size=batch_size,
num_epochs=None,
shuffle=True)
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
x=x_df,
y=y_series,
batch_size=batch_size,
shuffle=False)
predict_input_fn = tf.estimator.inputs.pandas_input_fn(
x=x_df,
batch_size=batch_size,
shuffle=False)
epsilon = 0.000001
# Precompute the numeric columns' means and standard deviations, and bind them
# as lambda default arguments so each column's normalizer uses its own
# statistics (see the note in the earlier normalization solution).
means = x_df[numeric_feature_names].mean()
stds = x_df[numeric_feature_names].std()
model_feature_columns = [
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            feature_name, vocabulary_list=car_data[feature_name].unique()))
    for feature_name in categorical_feature_names
] + [
    tf.feature_column.numeric_column(
        feature_name,
        normalizer_fn=lambda val, m=means[feature_name], s=stds[feature_name]:
            (val - m) / (epsilon + s))
    for feature_name in numeric_feature_names
]
print('model_feature_columns', model_feature_columns)
est = tf.estimator.DNNRegressor(
feature_columns=model_feature_columns,
hidden_units=[64],
optimizer=tf.train.AdagradOptimizer(learning_rate=0.01),
)
# TRAIN
num_print_statements = 10
num_training_steps = 10000
for _ in range(num_print_statements):
  est.train(train_input_fn, steps=num_training_steps // num_print_statements)
  scores = est.evaluate(eval_input_fn)
  # The `scores` dictionary has several metrics automatically generated by the
  # canned Estimator.
  # `average_loss` is the average loss for an individual example.
  # `loss` is the summed loss for the batch.
  # In addition to these scalar losses, you may find the visualization functions
  # in the next cell helpful for debugging model quality.
  print('scores', scores)