Keras Feature Columns are not an officially released feature yet, so a few caveats apply: please run this notebook on a GPU backend; Keras Feature Columns are not compatible with TPUs yet. Also, you will not be able to export this model to TensorFlow's "saved model" format for serving. The serving layer for feature columns will be added soon.
In [1]:
import os, json, math
import numpy as np
import tensorflow as tf
from tensorflow.python.feature_column import feature_column_v2 as fc # This will change when Keras FeatureColumn is final.
from matplotlib import pyplot as plt
print("Tensorflow version " + tf.__version__)
tf.enable_eager_execution()
In [0]:
#@title display utilities [RUN ME]
# utility to display training and validation curves
def display_training_curves(training, validation, title, subplot):
  if subplot%10==1: # set up the subplots on the first call
    plt.subplots(figsize=(10,10), facecolor='#F0F0F0')
    plt.tight_layout()
  ax = plt.subplot(subplot)
  ax.set_facecolor('#F8F8F8')
  ax.plot(training)
  ax.plot(validation)
  ax.set_title('model '+ title)
  ax.set_ylabel(title)
  ax.set_xlabel('epoch')
  ax.legend(['train', 'valid.'])
In [0]:
# backend identification
IS_COLAB = 'COLAB_GPU' in os.environ # this is always set on Colab, the value is 0 or 1 depending on GPU presence
HAS_COLAB_TPU = 'COLAB_TPU_ADDR' in os.environ
# Auth on Colab
if IS_COLAB:
  from google.colab import auth
  auth.authenticate_user()
# Also propagate the Auth to TPU if available so that it can access your GCS buckets
if IS_COLAB and HAS_COLAB_TPU:
  TF_MASTER = 'grpc://{}'.format(os.environ['COLAB_TPU_ADDR'])
  with tf.Session(TF_MASTER) as sess:
    with open('/content/adc.json', 'r') as f:
      auth_info = json.load(f)  # Upload the credentials to TPU.
    tf.contrib.cloud.configure_gcs(sess, credentials=auth_info)
  print('Using TPU')
# TPU usage flag
USE_TPU = HAS_COLAB_TPU
In [0]:
DATA_BUCKET = "gs://cloud-training-demos/taxifare/ch4/taxi_preproc/"
TRAIN_DATA_PATTERN = DATA_BUCKET + "train*"
VALID_DATA_PATTERN = DATA_BUCKET + "valid*"
In [0]:
CSV_COLUMNS = ['fare_amount', 'dayofweek', 'hourofday', 'pickuplon','pickuplat','dropofflon','dropofflat','passengers', 'key']
DEFAULTS = [[0.0], ['null'], [12], [-74.0], [40.0], [-74.0], [40.7], [1.0], ['nokey']]
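# Note: DEFAULTS plays two roles in tf.decode_csv below: it provides fallback values for missing
# fields, and the Python type of each default sets the dtype of the parsed column
# (e.g. [0.0] -> float32, [12] -> int32, ['null'] -> string).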
def decode_csv(line):
  column_values = tf.decode_csv(line, DEFAULTS)
  column_names = CSV_COLUMNS
  decoded_line = dict(zip(column_names, column_values))  # create a dictionary {'column_name': value, ...} for each line
  return decoded_line

def load_dataset(pattern):
  #filenames = tf.gfile.Glob(pattern)
  filenames = tf.data.Dataset.list_files(pattern)
  #dataset = tf.data.TextLineDataset(filenames)
  dataset = filenames.interleave(tf.data.TextLineDataset, cycle_length=16)  # interleave so that reading happens from multiple files in parallel
  dataset = dataset.map(decode_csv)
  return dataset
In [6]:
dataset = load_dataset(TRAIN_DATA_PATTERN)
for n, data in enumerate(dataset):
  numpy_data = {k: v.numpy() for k, v in data.items()}  # .numpy() works in eager mode
  print(numpy_data)
  if n>10: break
In [0]:
def add_engineered(features):
  # this is how you can do feature engineering in TensorFlow
  distance = tf.sqrt((features['pickuplat'] - features['dropofflat'])**2 +
                     (features['pickuplon'] - features['dropofflon'])**2)
  # the euclidean distance is hard for a neural network to emulate, so we add it as a feature
  features['euclidean'] = distance
  return features

def features_and_labels(features):
  features = add_engineered(features)
  features.pop('key')  # this column is not needed
  label = features.pop('fare_amount')  # this is what we will train for
  return features, label

def prepare_dataset(dataset, batch_size, truncate=None, shuffle=True):
  dataset = dataset.map(features_and_labels)
  if truncate is not None:
    dataset = dataset.take(truncate)
  dataset = dataset.cache()
  if shuffle:
    dataset = dataset.shuffle(10000)
  dataset = dataset.repeat()
  dataset = dataset.batch(batch_size)
  dataset = dataset.prefetch(-1)  # prefetch next batch while training (-1: autotune prefetch buffer size)
  return dataset
one_item = load_dataset(TRAIN_DATA_PATTERN).map(features_and_labels).take(1).batch(1)
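As a quick sanity check (illustrative only: the small batch size and truncation below are arbitrary), you can pull one batch out of the prepared dataset and inspect its shapes in eager mode:

sample_batch = prepare_dataset(load_dataset(TRAIN_DATA_PATTERN), batch_size=4, truncate=16)
for features, label in sample_batch.take(1):
  print({name: value.shape for name, value in features.items()})  # each feature: shape (4,)
  print(label.shape)                                              # labels: shape (4,)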
numeric_col = fc.numeric_column('name')
bucketized_numeric_col = fc.bucketized_column(fc.numeric_column('name'), [0, 2, 10])
indic_of_categ_col = fc.indicator_column(fc.categorical_column_with_identity('name', num_buckets = 24))
indic_of_categ_vocab_col = fc.indicator_column(fc.categorical_column_with_vocabulary_list('color', vocabulary_list = ['red', 'blue']))
indic_of_crossed_col = fc.indicator_column(fc.crossed_column([categcol1, categcol2], 16*16))
embedding_of_crossed_col = fc.embedding_column(fc.crossed_column([categcol1, categcol2], 16*16), 5)
| column | output vector shape | nb of parameters |
|---|---|---|
| numeric_col | [1] | 0 |
| bucketized_numeric_col | [nb of bucket boundaries + 1] | 0 |
| indic_of_categ_col | [nb categories] | 0 |
| indic_of_categ_vocab_col | [nb categories] | 0 |
| indic_of_crossed_col | [nb crossed categories] | 0 |
| embedding_of_crossed_col | [embedding size] | nb of crossed categories * embedding size |
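To make the shapes in the table concrete, here is a minimal sketch with toy feature names and values (hypothetical, not part of the taxi dataset), assuming FeatureLayer can be called directly on a dictionary of tensors in eager mode, as it is used inside the Keras model below:

toy_features = {'size': tf.constant([1.5, 7.0]), 'color': tf.constant(['red', 'blue'])}  # toy data
toy_columns = [
    fc.numeric_column('size'),                                    # 1 value per example
    fc.bucketized_column(fc.numeric_column('size'), [0, 2, 10]),  # 3 boundaries -> one-hot of 4 buckets
    fc.indicator_column(fc.categorical_column_with_vocabulary_list('color', vocabulary_list=['red', 'blue'])),  # one-hot of 2 categories
]
print(fc.FeatureLayer(toy_columns)(toy_features).shape)  # (2, 7): 1 + 4 + 2 values per example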
The big wins were bucketizing the coordinates and adding the euclidean distance. The crossed columns add only a little, and only if you train for longer. Try training on 10x the training and validation data. With crossed columns you should be able to reach RMSE=3.9.
In [11]:
NB_BUCKETS = 16
latbuckets = np.linspace(38.0, 42.0, NB_BUCKETS).tolist()
lonbuckets = np.linspace(-76.0, -72.0, NB_BUCKETS).tolist()
# the columns you can play with
# Categorical columns are used as:
# fc.indicator_column(dayofweek)
dayofweek = fc.categorical_column_with_vocabulary_list('dayofweek', vocabulary_list = ['Sun', 'Mon', 'Tues', 'Wed', 'Thu', 'Fri', 'Sat'])
hourofday = fc.categorical_column_with_identity('hourofday', num_buckets = 24)
# Bucketized columns can be used as such:
bucketized_pick_lat = fc.bucketized_column(fc.numeric_column('pickuplat'), latbuckets)
bucketized_pick_lon = fc.bucketized_column(fc.numeric_column('pickuplon'), lonbuckets)
bucketized_drop_lat = fc.bucketized_column(fc.numeric_column('dropofflat'), latbuckets)
bucketized_drop_lon = fc.bucketized_column(fc.numeric_column('dropofflon'), lonbuckets)
# Cross columns are used as
# fc.embedding_column(day_hr, 5)
day_hr = fc.crossed_column([dayofweek, hourofday], 24 * 7)
pickup_cross = fc.crossed_column([bucketized_pick_lat, bucketized_pick_lon], NB_BUCKETS * NB_BUCKETS)
dropoff_cross = fc.crossed_column([bucketized_drop_lat, bucketized_drop_lon], NB_BUCKETS * NB_BUCKETS)
#pickup_dropoff_pair = fc.crossed_column([pickup_cross, dropoff_cross], NB_BUCKETS ** 4)
columns = [
###
#
# YOUR FEATURE COLUMNS HERE (one possible set is sketched in the comment after this list)
#
fc.numeric_column('passengers'),
##
]
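# For reference, one possible way to fill in the list above (a sketch only, based on the earlier
# discussion: bucketized coordinates, the euclidean distance, and optionally the crossed columns):
# columns = [
#   fc.numeric_column('euclidean'),
#   fc.numeric_column('passengers'),
#   fc.indicator_column(dayofweek),
#   fc.indicator_column(hourofday),
#   bucketized_pick_lat, bucketized_pick_lon,
#   bucketized_drop_lat, bucketized_drop_lon,
#   fc.embedding_column(day_hr, 5),
#   fc.embedding_column(pickup_cross, 5),
#   fc.embedding_column(dropoff_cross, 5),
# ]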
l = tf.keras.layers
model = tf.keras.Sequential(
[
fc.FeatureLayer(columns),
l.Dense(100, activation='relu'),
l.Dense(64, activation='relu'),
l.Dense(32, activation='relu'),
l.Dense(16, activation='relu'),
l.Dense(1, activation=None), # regression
])
def rmse(y_true, y_pred):  # Root Mean Squared Error
  return tf.sqrt(tf.reduce_mean(tf.square(y_pred - y_true)))

def mae(y_true, y_pred):  # Mean Absolute Error
  return tf.reduce_mean(tf.abs(y_pred - y_true))
model.compile(optimizer=tf.train.AdamOptimizer(), # little bug: in eager mode, 'adam' is not yet accepted, must spell out tf.train.AdamOptimizer()
loss='mean_squared_error',
metrics=[rmse])
# print model layers
model.predict(one_item, steps=1) # little bug: with FeatureLayer, must call the model once on dummy data before .summary can work
model.summary()
In [12]:
EPOCHS = 8
BATCH_SIZE = 512
TRAIN_SIZE = 64*1024 # max is 2,141,023
VALID_SIZE = 4*1024 # max is 2,124,500
# Playground settings: TRAIN_SIZE = 64*1024, VALID_SIZE = 4*1024
# Solution settings: TRAIN_SIZE = 640*1024, VALID_SIZE = 64*1024
# This should reach RMSE = 3.9 (multiple runs may be necessary)
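# With the playground settings above: steps_per_epoch = 64*1024 // 512 = 128 steps
# and validation_steps = 4*1024 // 512 = 8 steps per epoch.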
train_dataset = prepare_dataset(load_dataset(TRAIN_DATA_PATTERN), batch_size=BATCH_SIZE, truncate=TRAIN_SIZE)
valid_dataset = prepare_dataset(load_dataset(VALID_DATA_PATTERN), batch_size=BATCH_SIZE, truncate=VALID_SIZE, shuffle=False)
history = model.fit(train_dataset, steps_per_epoch=TRAIN_SIZE//BATCH_SIZE, epochs=EPOCHS, shuffle=True,
validation_data=valid_dataset, validation_steps=VALID_SIZE//BATCH_SIZE)
In [13]:
print(history.history.keys())
display_training_curves(history.history['rmse'], history.history['val_rmse'], 'rmse', 211)
display_training_curves(history.history['loss'], history.history['val_loss'], 'loss', 212)
In [0]: