Learning Objectives:
Use the Estimator class in TensorFlow to predict median housing value.
The data is based on 1990 census data from California.
Let's use a set of features to predict house value.
In [ ]:
import math
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.INFO)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format
Next, we'll load our data set.
In [ ]:
df = pd.read_csv("https://storage.googleapis.com/ml_universities/california_housing_train.csv", sep = ",")
In [ ]:
df.head()
In [ ]:
df.describe()
This data is at the city block level, so these features reflect the total number of rooms in that block, or the total number of people who live in that block, respectively. Let's create different, more appropriate features. Because we are predicting the price of a single house, we should try to make all our features correspond to a single house as well.
In [ ]:
df['num_rooms'] = df['total_rooms'] / df['households']
df['num_bedrooms'] = df['total_bedrooms'] / df['households']
df['persons_per_house'] = df['population'] / df['households']
df.describe()
In [ ]:
df.drop(['total_rooms', 'total_bedrooms', 'population', 'households'], axis = 1, inplace = True)
df.describe()
In this exercise, we'll be trying to predict median_house_value. It will be our label. We'll use the remaining columns as our input features.
To train our model, we'll use the Estimator API and create a custom estimator for linear regression.
Note that we don't actually need a custom estimator for linear regression, since there is a canned estimator for it; however, we're keeping the model simple so you can practice writing a custom estimator function.
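For reference, the canned version would look something like the sketch below. This is a minimal sketch, assuming the feature_columns dict and the input functions defined in the cells that follow; 'canned_dir' is a hypothetical output directory.
In [ ]:
# Minimal sketch of the canned alternative (for comparison only):
# LinearRegressor handles the model_fn internals for us.
canned = tf.estimator.LinearRegressor(
    feature_columns = list(feature_columns.values()),
    model_dir = 'canned_dir')  # hypothetical directory
canned.train(input_fn = train_input_fn, max_steps = 1000)
print(canned.evaluate(input_fn = eval_input_fn))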
In [ ]:
# Define feature columns
feature_columns = {
colname : tf.feature_column.numeric_column(colname) \
for colname in ['housing_median_age','median_income','num_rooms','num_bedrooms','persons_per_house']
}
# Bucketize lat, lon so it's not so high-res; California is mostly N-S, so more lats than lons
feature_columns['longitude'] = tf.feature_column.bucketized_column(tf.feature_column.numeric_column('longitude'), np.linspace(-124.3, -114.3, 5).tolist())
feature_columns['latitude'] = tf.feature_column.bucketized_column(tf.feature_column.numeric_column('latitude'), np.linspace(32.5, 42, 10).tolist())
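As a quick sanity check, we can print the boundary lists; N boundaries produce N+1 buckets, so longitude gets 6 buckets and latitude gets 11.
In [ ]:
# 5 longitude boundaries -> 6 buckets; 10 latitude boundaries -> 11 buckets
print(np.linspace(-124.3, -114.3, 5).tolist())
print(np.linspace(32.5, 42, 10).tolist())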
In [ ]:
# Split into train and eval and create input functions
msk = np.random.rand(len(df)) < 0.8
traindf = df[msk]
evaldf = df[~msk]

SCALE = 100000
BATCH_SIZE = 128
train_input_fn = tf.estimator.inputs.pandas_input_fn(x = traindf[list(feature_columns.keys())],
                                                     y = traindf["median_house_value"] / SCALE,
                                                     num_epochs = None,
                                                     batch_size = BATCH_SIZE,
                                                     shuffle = True)
eval_input_fn = tf.estimator.inputs.pandas_input_fn(x = evaldf[list(feature_columns.keys())],
                                                    y = evaldf["median_house_value"] / SCALE,  # note the scaling
                                                    num_epochs = 1,
                                                    batch_size = len(evaldf),
                                                    shuffle = False)
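Since the mask is random, the split is only approximately 80/20 and the exact counts will vary from run to run. A quick check:
In [ ]:
# Approximate 80/20 split; counts vary with the random mask
print(len(traindf), len(evaldf))
print((traindf['median_house_value'] / SCALE).describe())  # scaled labels, roughly 0.1 to 5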
In [ ]:
# Create the custom estimator
def custom_estimator(features, labels, mode, params):
# 0. Extract data from feature columns
input_layer = tf.feature_column.input_layer(features, params['feature_columns'])
# 1. Define Model Architecture
predictions = tf.layers.dense(input_layer,1,activation=None)
# 2. Loss function, training/eval ops
if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
labels = tf.expand_dims(tf.cast(labels, dtype=tf.float32), -1)
loss = tf.losses.mean_squared_error(labels, predictions)
optimizer = tf.train.FtrlOptimizer(learning_rate=0.2)
train_op = optimizer.minimize(
loss = loss,
global_step = tf.train.get_global_step())
eval_metric_ops = {
"rmse": tf.metrics.root_mean_squared_error(labels*SCALE, predictions*SCALE)
}
else:
loss = None
train_op = None
eval_metric_ops = None
# 3. Create predictions
predictions_dict = {"predicted": predictions}
# 4. Create export outputs
export_outputs = {"regression_export_outputs": tf.estimator.export.RegressionOutput(value = predictions)}
# 5. Return EstimatorSpec
return tf.estimator.EstimatorSpec(
mode = mode,
predictions = predictions_dict,
loss = loss,
train_op = train_op,
eval_metric_ops = eval_metric_ops,
export_outputs = export_outputs)
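Note that loss, train_op, and eval_metric_ops are set to None in predict mode; EstimatorSpec only requires the fields relevant to the mode it is given, so predictions (plus export_outputs for serving) are enough for prediction.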
In [ ]:
# Create serving input function
def serving_input_fn():
feature_placeholders = {
colname : tf.placeholder(tf.float32, [None]) for colname in 'housing_median_age,median_income,num_rooms,num_bedrooms,persons_per_house'.split(',')
}
feature_placeholders['longitude'] = tf.placeholder(tf.float32, [None])
feature_placeholders['latitude'] = tf.placeholder(tf.float32, [None])
features = {
key: tf.expand_dims(tensor, -1)
for key, tensor in feature_placeholders.items()
}
return tf.estimator.export.ServingInputReceiver(features, feature_placeholders)
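The tf.expand_dims call reshapes each placeholder from shape [None] (a batch of scalars) to [None, 1], the rank the feature columns expect. A tiny standalone illustration in a scratch graph:
In [ ]:
# Illustration only: expand_dims appends a dimension of size 1
with tf.Graph().as_default():
  t = tf.placeholder(tf.float32, [None])
  print(tf.expand_dims(t, -1).shape)  # prints (?, 1)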
In [ ]:
# Create the custom estimator's train and evaluate function
def train_and_evaluate(output_dir):
  estimator = tf.estimator.Estimator(
    model_fn = custom_estimator,
    model_dir = output_dir,
    params = {'feature_columns': list(feature_columns.values())})
  train_spec = tf.estimator.TrainSpec(input_fn = train_input_fn,
                                      max_steps = 1000)
  exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
  eval_spec = tf.estimator.EvalSpec(input_fn = eval_input_fn,
                                    steps = None,
                                    exporters = exporter)
  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

# Run training
OUTDIR = 'custom_estimator_trained_model'
shutil.rmtree(OUTDIR, ignore_errors = True)  # start fresh each time
train_and_evaluate(OUTDIR)
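Once training finishes, we can point a fresh Estimator at OUTDIR to reload the trained weights and generate predictions. A minimal sketch, assuming the cells above have run; the model was trained on scaled labels, so we multiply by SCALE to get dollars:
In [ ]:
# Reload the trained model and predict on a few eval rows
trained = tf.estimator.Estimator(
  model_fn = custom_estimator,
  model_dir = OUTDIR,
  params = {'feature_columns': list(feature_columns.values())})
predict_input_fn = tf.estimator.inputs.pandas_input_fn(
  x = evaldf[list(feature_columns.keys())].head(3),
  num_epochs = 1,
  shuffle = False)
for pred in trained.predict(input_fn = predict_input_fn):
  print(pred['predicted'] * SCALE)  # undo the label scaling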
As an extension, let's turn the linear model into a neural network by adding a hidden layer to the model architecture:
In [ ]:
def custom_estimator(features, labels, mode, params):
  # 0. Extract data from feature columns
  input_layer = tf.feature_column.input_layer(features, params['feature_columns'])

  # 1. Define model architecture: a hidden ReLU layer feeding its output
  #    (not input_layer) into the final linear layer
  hidden = tf.layers.dense(input_layer, 10, activation = tf.nn.relu)
  predictions = tf.layers.dense(hidden, 1, activation = None)

  # ...rest as before
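To compare this deeper model against the linear one, re-run train_and_evaluate with a fresh output directory and compare the final rmse values reported during evaluation.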