Keras Feature Columns are not an officially released feature yet. Some caveats apply: please run this notebook on a GPU backend, as Keras Feature Columns are not compatible with TPUs yet. Also, you will not be able to export this model to TensorFlow's "saved model" format for serving. The serving layer for feature columns will be added soon.

Imports


In [1]:
import os, json, math
import numpy as np
import tensorflow as tf
from tensorflow.python.feature_column import feature_column_v2 as fc  # This will change when Keras FeatureColumn is final.
from matplotlib import pyplot as plt
print("Tensorflow version " + tf.__version__)
tf.enable_eager_execution()


Tensorflow version 1.12.0

In [0]:
#@title display utilities [RUN ME]
# utility to display training and validation curves
def display_training_curves(training, validation, title, subplot):
  if subplot%10==1: # set up the subplots on the first call
    plt.subplots(figsize=(10,10), facecolor='#F0F0F0')
    plt.tight_layout()
  ax = plt.subplot(subplot)
  ax.set_facecolor('#F8F8F8')
  ax.plot(training)
  ax.plot(validation)
  ax.set_title('model '+ title)
  ax.set_ylabel(title)
  ax.set_xlabel('epoch')
  ax.legend(['train', 'valid.'])

Colab-only auth


In [0]:
# backend identification
IS_COLAB = 'COLAB_GPU' in os.environ  # this is always set on Colab, the value is 0 or 1 depending on GPU presence
HAS_COLAB_TPU = 'COLAB_TPU_ADDR' in os.environ

# Auth on Colab
if IS_COLAB:
  from google.colab import auth
  auth.authenticate_user()
  
# Also propagate the Auth to TPU if available so that it can access your GCS buckets
if IS_COLAB and HAS_COLAB_TPU:
  TF_MASTER = 'grpc://{}'.format(os.environ['COLAB_TPU_ADDR'])
  with tf.Session(TF_MASTER) as sess:    
    with open('/content/adc.json', 'r') as f:
      auth_info = json.load(f) # Upload the credentials to TPU.
    tf.contrib.cloud.configure_gcs(sess, credentials=auth_info)
  print('Using TPU')

# TPU usage flag
USE_TPU = HAS_COLAB_TPU

Config


In [0]:
DATA_BUCKET = "gs://cloud-training-demos/taxifare/ch4/taxi_preproc/"
TRAIN_DATA_PATTERN = DATA_BUCKET + "train*"
VALID_DATA_PATTERN = DATA_BUCKET + "valid*"

In [0]:
CSV_COLUMNS = ['fare_amount', 'dayofweek', 'hourofday', 'pickuplon','pickuplat','dropofflon','dropofflat','passengers', 'key']
DEFAULTS = [[0.0], ['null'], [12], [-74.0], [40.0], [-74.0], [40.7], [1.0], ['nokey']]

def decode_csv(line):
  column_values = tf.decode_csv(line, DEFAULTS)
  column_names = CSV_COLUMNS
  decoded_line = dict(zip(column_names, column_values)) # create a dictionary {'column_name': value, ...} for each line  
  return decoded_line

def load_dataset(pattern):
  #filenames = tf.gfile.Glob(pattern)
  filenames = tf.data.Dataset.list_files(pattern)
  #dataset = tf.data.TextLineDataset(filenames)
  dataset = filenames.interleave(tf.data.TextLineDataset, cycle_length=16)  # interleave so that reading happens from multiple files in parallel
  dataset = dataset.map(decode_csv)
  return dataset

In [6]:
dataset = load_dataset(TRAIN_DATA_PATTERN)
for n, data in enumerate(dataset):
  numpy_data = {k: v.numpy() for k, v in data.items()} # .numpy() works in eager mode
  print(numpy_data)
  if n>10: break


{'fare_amount': 2.5, 'dayofweek': b'Thu', 'hourofday': 0, 'pickuplon': -73.97609, 'pickuplat': 40.764126, 'dropofflon': -73.97421, 'dropofflat': 40.763348, 'passengers': 1.0, 'key': b'2010-03-18 00:02:09.000000-73.976140.764140.7633-73.9742'}
{'fare_amount': 2.5, 'dayofweek': b'Wed', 'hourofday': 0, 'pickuplon': -74.00178, 'pickuplat': 40.735416, 'dropofflon': -74.00195, 'dropofflat': 40.734978, 'passengers': 1.0, 'key': b'2014-03-05 00:07:00.000000-74.001840.735440.735-74.002'}
{'fare_amount': 5.3, 'dayofweek': b'Wed', 'hourofday': 22, 'pickuplon': -74.00663, 'pickuplat': 40.71627, 'dropofflon': -73.99982, 'dropofflat': 40.731148, 'passengers': 1.0, 'key': b'2010-12-22 22:44:00.000000-74.006640.716340.7311-73.9998'}
{'fare_amount': 4.1, 'dayofweek': b'Sun', 'hourofday': 11, 'pickuplon': -73.98547, 'pickuplat': 40.759377, 'dropofflon': -73.99159, 'dropofflat': 40.75146, 'passengers': 1.0, 'key': b'2009-07-05 11:04:32.000000-73.985540.759440.7515-73.9916'}
{'fare_amount': 19.0, 'dayofweek': b'Sat', 'hourofday': 8, 'pickuplon': -74.006165, 'pickuplat': 40.7051, 'dropofflon': -73.980095, 'dropofflat': 40.75247, 'passengers': 2.0, 'key': b'2013-03-09 08:30:00.000000-74.006240.705140.7525-73.9801'}
{'fare_amount': 10.9, 'dayofweek': b'Tue', 'hourofday': 16, 'pickuplon': -73.9871, 'pickuplat': 40.72528, 'dropofflon': -73.99647, 'dropofflat': 40.75305, 'passengers': 1.0, 'key': b'2012-05-29 16:16:03.000000-73.987140.725340.7531-73.9965'}
{'fare_amount': 16.1, 'dayofweek': b'Sun', 'hourofday': 15, 'pickuplon': -74.0107, 'pickuplat': 40.710037, 'dropofflon': -73.96543, 'dropofflat': 40.75697, 'passengers': 2.0, 'key': b'2009-05-03 15:51:13.000000-74.010740.7140.757-73.9654'}
{'fare_amount': 6.5, 'dayofweek': b'Wed', 'hourofday': 6, 'pickuplon': -74.010414, 'pickuplat': 40.72008, 'dropofflon': -74.008514, 'dropofflat': 40.707844, 'passengers': 1.0, 'key': b'2013-09-04 06:34:00.000000-74.010440.720140.7078-74.0085'}
{'fare_amount': 15.5, 'dayofweek': b'Fri', 'hourofday': 19, 'pickuplon': -73.971924, 'pickuplat': 40.745842, 'dropofflon': -74.00344, 'dropofflat': 40.732437, 'passengers': 1.0, 'key': b'2015-02-06 19:48:43.000000-73.971940.745840.7324-74.0034'}
{'fare_amount': 2.5, 'dayofweek': b'Fri', 'hourofday': 0, 'pickuplon': -73.99397, 'pickuplat': 40.75142, 'dropofflon': -73.99095, 'dropofflat': 40.755474, 'passengers': 1.0, 'key': b'2014-08-29 00:29:00.000000-73.99440.751440.7555-73.991'}
{'fare_amount': 7.3, 'dayofweek': b'Thu', 'hourofday': 14, 'pickuplon': -73.825165, 'pickuplat': 40.745167, 'dropofflon': -73.82816, 'dropofflat': 40.75099, 'passengers': 5.0, 'key': b'2010-05-13 14:53:00.000000-73.825240.745240.751-73.8282'}
{'fare_amount': 2.5, 'dayofweek': b'Sun', 'hourofday': 0, 'pickuplon': -73.9714, 'pickuplat': 40.745632, 'dropofflon': -73.97146, 'dropofflat': 40.74553, 'passengers': 2.0, 'key': b'2009-05-03 00:27:00.000000-73.971440.745640.7455-73.9715'}

In [0]:
def add_engineered(features):
    # this is how you can do feature engineering in TensorFlow
    distance = tf.sqrt((features['pickuplat'] - features['dropofflat'])**2 +
                       (features['pickuplon'] - features['dropofflon'])**2)
    
    # euclidean distance is hard for a neural network to emulate
    features['euclidean'] = distance
    return features

def features_and_labels(features):
  features = add_engineered(features)
  features.pop('key') # this column not needed
  label = features.pop('fare_amount') # this is what we will train for
  return features, label
  
def prepare_dataset(dataset, batch_size, truncate=None, shuffle=True):
  dataset = dataset.map(features_and_labels)
  if truncate is not None:
    dataset = dataset.take(truncate)
  dataset = dataset.cache()
  if shuffle:
    dataset = dataset.shuffle(10000)
  dataset = dataset.repeat()
  dataset = dataset.batch(batch_size)
  dataset = dataset.prefetch(-1) # prefetch next batch while training  (-1: autotune prefetch buffer size)
  return dataset

one_item = load_dataset(TRAIN_DATA_PATTERN).map(features_and_labels).take(1).batch(1)

Linear Keras model [WORK REQUIRED]

  1. What do the columns do? Familiarize yourself with these column types.

numeric_col = fc.numeric_column('name')

bucketized_numeric_col = fc.bucketized_column(fc.numeric_column('name'), [0, 2, 10])

indic_of_categ_col = fc.indicator_column(fc.categorical_column_with_identity('name', num_buckets = 24))

indic_of_categ_vocab_col = fc.indicator_column(fc.categorical_column_with_vocabulary_list('color', vocabulary_list = ['red', 'blue']))

indic_of_crossed_col = fc.indicator_column(fc.crossed_column([categcol1, categcol2], 16*16))

embedding_of_crossed_col = fc.embedding_column(fc.crossed_column([categcol1, categcol2], 16*16), 5)

column                     output vector shape           nb of parameters
numeric_col                [1]                           0
bucketized_numeric_col     [nb bucket boundaries + 1]    0
indic_of_categ_col         [nb categories]               0
indic_of_categ_vocab_col   [nb categories]               0
indic_of_crossed_col       [nb crossed categories]       0
embedding_of_crossed_col   [embedding size]              nb crossed categories * embedding size

(A quick way to check these output widths yourself is sketched right after this list.)
  2. Let's start by feeding in all the data as simply as possible: numeric columns for numerical values, categorical (one-hot encoded) columns for categorical data like the day of the week or the hour of the day. Try training...
    • RMSE flat at 8-9 ... not good
  3. Try replacing the numerical latitudes and longitudes with their bucketized versions
    • RMSE trains down to 6 ... progress!
  4. Try adding an engineered feature like 'euclidean' for the distance traveled by the taxi
    • RMSE trains down to 4-5 ... progress! The euclidean distance is really hard for a neural network to emulate. Look through the code to see how it was "engineered".
  5. Now add embedded crossed columns for:
    • hourofday x dayofweek
    • pickup neighborhood (bucketized pickup lon x bucketized pickup lat)
    • dropoff neighborhood (bucketized dropoff lon x bucketized dropoff lat)
    • Is this better?
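
The output widths from the table above can be probed directly in eager mode. This is a minimal sketch, assuming (as the model.predict call below also does) that a FeatureLayer built from a single column can be applied to a dict of tensors; the feature names and values are just examples:

# Sketch: probe the output width of individual feature columns in eager mode.
probe = {'passengers': tf.constant([1.0]), 'hourofday': tf.constant([3])}
for col in [fc.numeric_column('passengers'),
            fc.indicator_column(fc.categorical_column_with_identity('hourofday', num_buckets=24))]:
  layer = fc.FeatureLayer([col])
  print(col.name, layer(probe).shape)  # expected widths: 1 and 24 respectively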

The big wins were bucketizing the coordinates and adding the euclidean distance. The crossed columns add only a little, and only if you train for longer. Try training on 10x the training and validation data. With crossed columns you should be able to reach RMSE=3.9. One possible set of feature columns is sketched just below.
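
As a reference point, here is one possible (not the only valid) assembly of the columns defined in the next cell; which indicator, bucketized and embedded crossed columns you keep, and the embedding size of 5, are choices to experiment with:

# Sketch of a possible feature column set, using the columns defined in the next cell.
columns_sketch = [
    fc.numeric_column('passengers'),
    fc.numeric_column('euclidean'),         # engineered in add_engineered()
    fc.indicator_column(dayofweek),         # one-hot day of week
    fc.indicator_column(hourofday),         # one-hot hour of day
    bucketized_pick_lon, bucketized_pick_lat,
    bucketized_drop_lon, bucketized_drop_lat,
    fc.embedding_column(day_hr, 5),         # hourofday x dayofweek
    fc.embedding_column(pickup_cross, 5),   # pickup neighborhood
    fc.embedding_column(dropoff_cross, 5),  # dropoff neighborhood
]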


In [11]:
NB_BUCKETS = 16
latbuckets = np.linspace(38.0, 42.0, NB_BUCKETS).tolist()
lonbuckets = np.linspace(-76.0, -72.0, NB_BUCKETS).tolist()


# the columns you can play with

# Categorical columns are used as:
# fc.indicator_column(dayofweek)
dayofweek = fc.categorical_column_with_vocabulary_list('dayofweek', vocabulary_list = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'])  # 'Tue' matches the values seen in the data above
hourofday = fc.categorical_column_with_identity('hourofday', num_buckets = 24)

# Bucketized columns can be used as such:
bucketized_pick_lon = fc.bucketized_column(fc.numeric_column('pickuplon'), lonbuckets)
bucketized_pick_lat = fc.bucketized_column(fc.numeric_column('pickuplat'), latbuckets)
bucketized_drop_lon = fc.bucketized_column(fc.numeric_column('dropofflon'), lonbuckets)
bucketized_drop_lat = fc.bucketized_column(fc.numeric_column('dropofflat'), latbuckets)

# Cross columns are used as
# fc.embedding_column(day_hr, 5)
day_hr =  fc.crossed_column([dayofweek, hourofday], 24 * 7)
pickup_cross  = fc.crossed_column([bucketized_pick_lat, bucketized_pick_lon], NB_BUCKETS * NB_BUCKETS)
dropoff_cross = fc.crossed_column([bucketized_drop_lat, bucketized_drop_lon], NB_BUCKETS * NB_BUCKETS)
#pickup_dropoff_pair = fc.crossed_column([pickup_cross, dropoff_cross], NB_BUCKETS ** 4)
  
columns = [
    
    ###
    #
    # YOUR FEATURE COLUMNS HERE
    #
    fc.numeric_column('passengers'),
    ##
]

l = tf.keras.layers
model = tf.keras.Sequential(
    [
        fc.FeatureLayer(columns),
        l.Dense(100, activation='relu'),
        l.Dense(64, activation='relu'),
        l.Dense(32, activation='relu'),
        l.Dense(16, activation='relu'),
        l.Dense(1, activation=None), # regression
    ])

def rmse(y_true, y_pred): # Root Mean Squared Error
  return tf.sqrt(tf.reduce_mean(tf.square(y_pred - y_true)))

def mae(y_true, y_pred): # Mean Absolute Error
  return tf.reduce_mean(tf.abs(y_pred - y_true))
  
model.compile(optimizer=tf.train.AdamOptimizer(), # little bug: in eager mode, 'adam' is not yet accepted, must spell out tf.train.AdamOptimizer()
              loss='mean_squared_error',
              metrics=[rmse])

# print model layers
model.predict(one_item, steps=1) # little bug: with FeatureLayer, must call the model once on dummy data before .summary can work
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
feature_layer_1 (FeatureLaye multiple                  0         
_________________________________________________________________
dense_5 (Dense)              multiple                  200       
_________________________________________________________________
dense_6 (Dense)              multiple                  6464      
_________________________________________________________________
dense_7 (Dense)              multiple                  2080      
_________________________________________________________________
dense_8 (Dense)              multiple                  528       
_________________________________________________________________
dense_9 (Dense)              multiple                  17        
=================================================================
Total params: 9,289
Trainable params: 9,289
Non-trainable params: 0
_________________________________________________________________

In [12]:
EPOCHS = 8
BATCH_SIZE = 512
TRAIN_SIZE = 64*1024  # max is 2,141,023
VALID_SIZE = 4*1024   # max is 2,124,500

# Playground settings: TRAIN_SIZE = 64*1024, VALID_SIZE = 4*1024
# Solution settings: TRAIN_SIZE = 640*1024, VALID_SIZE = 64*1024

# This should reach RMSE = 3.9 (multiple runs may be necessary)

train_dataset = prepare_dataset(load_dataset(TRAIN_DATA_PATTERN), batch_size=BATCH_SIZE, truncate=TRAIN_SIZE)
valid_dataset = prepare_dataset(load_dataset(VALID_DATA_PATTERN), batch_size=BATCH_SIZE, truncate=VALID_SIZE, shuffle=False)

history = model.fit(train_dataset, steps_per_epoch=TRAIN_SIZE//BATCH_SIZE, epochs=EPOCHS, shuffle=True,
                    validation_data=valid_dataset, validation_steps=VALID_SIZE//BATCH_SIZE)


Epoch 1/8
128/128 [==============================] - 20s 157ms/step - loss: 187.0750 - rmse: 13.4228 - val_loss: 59.7003 - val_rmse: 7.3774
Epoch 2/8
128/128 [==============================] - 2s 19ms/step - loss: 136.8302 - rmse: 11.4675 - val_loss: 52.0826 - val_rmse: 6.8467
Epoch 3/8
128/128 [==============================] - 2s 19ms/step - loss: 123.6562 - rmse: 10.8938 - val_loss: 47.9557 - val_rmse: 6.5524
Epoch 4/8
128/128 [==============================] - 2s 19ms/step - loss: 119.1638 - rmse: 10.6985 - val_loss: 47.9505 - val_rmse: 6.5567
Epoch 5/8
128/128 [==============================] - 2s 19ms/step - loss: 119.1454 - rmse: 10.7169 - val_loss: 47.3986 - val_rmse: 6.4993
Epoch 6/8
128/128 [==============================] - 2s 19ms/step - loss: 119.1191 - rmse: 10.6777 - val_loss: 47.8205 - val_rmse: 6.5445
Epoch 7/8
128/128 [==============================] - 2s 19ms/step - loss: 119.3316 - rmse: 10.7100 - val_loss: 47.7658 - val_rmse: 6.5388
Epoch 8/8
128/128 [==============================] - 2s 19ms/step - loss: 119.0250 - rmse: 10.6859 - val_loss: 47.6381 - val_rmse: 6.5254

In [13]:
print(history.history.keys())
display_training_curves(history.history['rmse'], history.history['val_rmse'], 'rmse', 211)
display_training_curves(history.history['loss'], history.history['val_loss'], 'loss', 212)


dict_keys(['val_loss', 'val_rmse', 'loss', 'rmse'])

In [0]: