How to normalize features in TensorFlow


In [ ]:
import shutil

import numpy as np
import pandas as pd
import tensorflow as tf

# Seed NumPy so the random train/eval split below is reproducible on
# Restart Kernel -> Run All (otherwise every run trains on a different split).
np.random.seed(42)

# California housing data hosted by Google Cloud Storage.
df = pd.read_csv("https://storage.googleapis.com/ml_universities/california_housing_train.csv", sep=",")

# Random ~80/20 train/eval split via a boolean mask over the rows.
msk = np.random.rand(len(df)) < 0.8
traindf = df[msk]
evaldf = df[~msk]
traindf.head()

In [ ]:
def get_normalization_parameters(traindf, features):
  """Compute per-feature z-score statistics from the training split.

  The statistics are computed on `traindf` only, so the exact same
  parameters can be reused for training, evaluation, and serving.

  Args:
    traindf: pandas DataFrame holding the training data.
    features: iterable of column names to compute statistics for.

  Returns:
    Dict mapping each feature name to {'mean': ..., 'std': ...}.
  """
  return {
      column: {'mean': traindf[column].mean(), 'std': traindf[column].std()}
      for column in features
  }

# The six numeric input columns to z-score; the target column is excluded.
NUMERIC_FEATURES = ['housing_median_age', 'total_rooms', 'total_bedrooms',
                    'population', 'households', 'median_income']
# Statistics come from the training split only, so eval data never leaks
# into the normalization parameters.
normalization_parameters = get_normalization_parameters(traindf,
                                                        NUMERIC_FEATURES)
normalization_parameters

In [ ]:
def _numeric_column_normalized(column_name, normalizer_fn):
  """Build a numeric feature column with an attached normalizer function."""
  return tf.feature_column.numeric_column(
      column_name, normalizer_fn=normalizer_fn)


# Define your feature columns
def create_feature_cols(features, use_normalization):
  """Create our feature columns using tf.feature_column. This function will 
  get executed during training, evaluation, and serving."""
  normalized_feature_columns = []
  for column_name in features:
    if use_normalization:
      column_params = normalization_parameters[column_name]
      mean = column_params['mean']
      std = column_params['std']
      def normalize_column(col):  # Use mean, std defined above.
        return (col - mean)/std
      normalizer_fn = normalize_column
    else:
      normalizer_fn = None
    normalized_feature_columns.append(_numeric_column_normalized(column_name,
                                                              normalizer_fn))
  return normalized_feature_columns

In [ ]:
def input_fn(df, shuffle=True):
  """Build a pandas-backed input function for training or evaluation.

  The label `median_house_value` is divided by 100,000 so the regression
  target is on the order of 1.0.
  """
  labels = df["median_house_value"]/100000  # Scale target.
  return tf.estimator.inputs.pandas_input_fn(x=df, y=labels, shuffle=shuffle)

def train_and_evaluate(use_normalization, outdir):
  """Train a LinearRegressor, evaluate it, and export a serving model.

  Args:
    use_normalization: whether the feature columns apply z-score
      normalization (built from the module-level normalization_parameters).
    outdir: directory for checkpoints, summaries, and the exported model;
      wiped at the start of each run.

  Returns:
    The trained tf.estimator.LinearRegressor.
  """
  shutil.rmtree(outdir, ignore_errors = True) # start fresh each time
  
  feature_columns = create_feature_cols(NUMERIC_FEATURES, use_normalization)
  
  run_config = tf.estimator.RunConfig(save_summary_steps=10,
                                      model_dir = outdir  # More granular checkpointing for TensorBoard.
                                     )
  model = tf.estimator.LinearRegressor(feature_columns = feature_columns, config=run_config)
  # Training input function.
  train_spec = tf.estimator.TrainSpec(input_fn=input_fn(traindf),
                                      max_steps=1000)

  
  def json_serving_input_fn():
    """Build the serving inputs. For serving real-time predictions
    using ml-engine."""
    # One placeholder per feature column. The normalizer_fn baked into each
    # column runs at serving time too, so clients send raw feature values.
    inputs = {}
    for feat in feature_columns:
      inputs[feat.name] = tf.placeholder(shape=[None], dtype=feat.dtype)
    return tf.estimator.export.ServingInputReceiver(inputs, inputs)
  
  # Evaluation and serving input function.
  exporter = tf.estimator.FinalExporter('housing', json_serving_input_fn)
  eval_spec = tf.estimator.EvalSpec(input_fn=input_fn(evaldf),
                                    exporters=[exporter],
                                    name='housing-eval')
  # Train and evaluate the model.
  tf.estimator.train_and_evaluate(model, train_spec, eval_spec)
  return model

In [ ]:
# Train once without normalization and once with it, writing to separate
# model directories so the two runs can be compared side by side.
results = train_and_evaluate(False, 'housing_trained')
normalized_results = train_and_evaluate(True, 'housing_trained_normalization')

Deploy on Google Cloud ML Engine

Test model training locally


In [ ]:
%%bash
# Run the trainer module locally; code lives under ./model_code.
OUTPUT_DIR="trained_model"
export PYTHONPATH="${PYTHONPATH}:${PWD}/model_code"
python -m trainer.task --outdir "${OUTPUT_DIR}" --normalize_input 1

Train on the cloud

Test cloud parameters (optional)


In [ ]:
%%bash 
# Dry-run the training package with `gcloud ml-engine local train` before
# submitting to the cloud. Flags after the bare `--` go to trainer.task.
# NOTE: JOBNAME and REGION are set here but not used by the local command.
OUTPUT_DIR='housing_trained_model'
JOBNAME=my_ml_job_$(date -u +%y%m%d_%H%M%S)
REGION='us-central1'
PACKAGE_PATH=$PWD/model_code/trainer

gcloud ml-engine local train\
    --package-path=$PACKAGE_PATH\
    --module-name=trainer.task\
    --\
    --outdir=$OUTPUT_DIR\
    --normalize_input=0

Submit job


In [ ]:
%%bash
# Submit the training job to Cloud ML Engine. Code is staged in the GCS
# bucket and output is written under $OUTPUT_DIR in that bucket.
# Flags after the bare `--` are passed through to trainer.task.
JOBNAME=my_ml_job_$(date -u +%y%m%d_%H%M%S)
REGION='us-central1'
BUCKET='gs://crawles-sandbox'
OUTPUT_DIR=$BUCKET/'housing_trained_model'
PACKAGE_PATH=$PWD/model_code/trainer

gcloud ml-engine jobs submit training $JOBNAME \
        --package-path=$PACKAGE_PATH \
        --module-name=trainer.task \
        --region=$REGION \
        --staging-bucket=$BUCKET\
        --scale-tier=BASIC \
        --runtime-version=1.8 \
        -- \
        --outdir=$OUTPUT_DIR\
        --normalize_input=0