This is a full example of how to do time series prediction with the TensorFlow high-level APIs. We'll use a weather dataset built from the NOAA GSOD public dataset on BigQuery, which can be generated with the following query:
SELECT year, mo, da,
avg(temp) as avg_tmp,
avg(dewp) as avg_dewp,
avg(slp) as avg_slp
FROM `bigquery-public-data.noaa_gsod.gsod*`
WHERE temp <> 9999.9 and dewp <> 9999.9 and slp <> 9999.9
GROUP BY year, mo, da
ORDER BY year asc, mo asc, da asc
You can download the data from here.
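If you'd rather pull the data programmatically instead of downloading the CSV, a minimal sketch using the google-cloud-bigquery client could look like the snippet below. This is not part of the original notebook; it assumes you have the google-cloud-bigquery and pandas packages installed and a GCP project with credentials configured.

from google.cloud import bigquery

QUERY = """
SELECT year, mo, da,
       avg(temp) as avg_tmp,
       avg(dewp) as avg_dewp,
       avg(slp) as avg_slp
FROM `bigquery-public-data.noaa_gsod.gsod*`
WHERE temp <> 9999.9 and dewp <> 9999.9 and slp <> 9999.9
GROUP BY year, mo, da
ORDER BY year asc, mo asc, da asc
"""

client = bigquery.Client()                  # uses your default credentials/project
weather = client.query(QUERY).to_dataframe()
weather.to_csv('weather.csv', index=False)  # same file the notebook reads below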
We'll implement an RNN model using the Estimators API that, given the weather measurements of 10 consecutive days, predicts the average temperature of the following day (the 11th).
In [46]:
# tensorflow
import tensorflow as tf
# rnn common functions
from tensorflow.contrib.learn.python.learn.estimators import rnn_common
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
# helpers
import numpy as np
import pandas as pd
import csv
# enable tensorflow logs
tf.logging.set_verbosity(tf.logging.INFO)
We're using a weather dataset with one row per day: the daily average temperature (avg_tmp), dew point (avg_dewp), and sea-level pressure (avg_slp), computed from the NOAA GSOD public dataset on BigQuery with the query above.
The goal is: based on features from the past days, predict the average temperature of the next day. More specifically, we'll use the data from 10 consecutive days as input and predict the average temperature of the 11th day; for example, rows 0 to 9 form the first input sequence and row 10's avg_tmp is its label.
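To make the windowing concrete before we build it, here is a tiny toy illustration (not part of the original notebook) of how a length-3 window maps to its label on a 1-D series; the real code below does the same thing with length-10 windows of 3 features:

import numpy as np

# toy series of daily temperatures
series = np.array([20.0, 21.0, 19.5, 22.0, 23.5])
window, label = series[0:3], series[3]  # features: days 0-2, label: day 3
print(window, label)                    # [20.  21.  19.5] 22.0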
In [47]:
df = pd.read_csv('weather.csv')
number_of_rows = len(df)
print('number of rows in the dataset:', number_of_rows)
print('what a row looks like:')
print(df.head(11))
print()
print("we don't the year mo da columns, so let's forget about them")
df = df[['avg_tmp', 'avg_dewp', 'avg_slp']]
print(df.head(11))
In [48]:
SEQ_LEN = 10
VALID_ROWS = number_of_rows - SEQ_LEN - 1
NUM_FEATURES = 3
# convert to a numpy array so we can use indexes to access rows easily
df = np.asarray(df)
# sequences will have shape: [VALID_ROWS, SEQ_LEN, NUM_FEATURES]
sequences = np.zeros((VALID_ROWS, SEQ_LEN, NUM_FEATURES), dtype=np.float32)
labels = np.zeros((VALID_ROWS, 1))
# sequences are 10 days
# label is the avg_tmp for the following day (11th)
for i in range(VALID_ROWS):
    sequences[i] = df[i: i + SEQ_LEN]
    labels[i] = df[i + SEQ_LEN][0]
print('-' * 20)
print('Example')
print('-' * 20)
print('sequence:')
print(sequences[0])
print('prediction:', labels[0])
In [49]:
# these values are based on the number of valid rows which is 32083
TRAIN_SIZE = 30000
EVAL_SIZE = 2073
TEST_SIZE = 10
# TODO(@monteirom): shuffle
train_seq = sequences[:TRAIN_SIZE]
train_label = np.asarray(labels[:TRAIN_SIZE], dtype=np.float32)
eval_seq = sequences[TRAIN_SIZE: TRAIN_SIZE + EVAL_SIZE]
eval_label = np.asarray(labels[TRAIN_SIZE:TRAIN_SIZE + EVAL_SIZE], dtype=np.float32)
test_seq = sequences[TRAIN_SIZE + EVAL_SIZE: ]
test_label = np.asarray(labels[TRAIN_SIZE + EVAL_SIZE: ], dtype=np.float32)
print('train shape:', train_seq.shape)
print('eval shape:', eval_seq.shape)
print('test shape:', test_seq.shape)
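The TODO above is about shuffling. The training input function below already shuffles batches, but if you also want to shuffle the training windows themselves, a simple sketch (not in the original notebook) is:

# shuffle the training windows and labels together (sketch)
perm = np.random.permutation(TRAIN_SIZE)
train_seq = train_seq[perm]
train_label = train_label[perm]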
In [50]:
# getting test labels
test_plot_data = [test_label[i][0] for i in range(TEST_SIZE)]
# plotting
sns.tsplot(test_plot_data)
plt.show()
In [51]:
BATCH_SIZE = 64
FEATURE_KEY = 'x'
SEQ_LEN_KEY = 'sequence_length'
def make_dict(x):
    d = {}
    d[FEATURE_KEY] = x
    # shape: [number of examples],
    # where each entry is the length (SEQ_LEN) of the corresponding sequence
    d[SEQ_LEN_KEY] = np.asarray(x.shape[0] * [SEQ_LEN], dtype=np.int32)
    return d
# Make input function for training:
# num_epochs=None -> will cycle through input data forever
# shuffle=True -> randomize order of input data
train_input_fn = tf.estimator.inputs.numpy_input_fn(x=make_dict(train_seq),
                                                    y=train_label,
                                                    batch_size=BATCH_SIZE,
                                                    shuffle=True,
                                                    num_epochs=None)
# Make input function for evaluation:
# shuffle=False -> do not randomize input data
eval_input_fn = tf.estimator.inputs.numpy_input_fn(x=make_dict(eval_seq),
                                                   y=eval_label,
                                                   batch_size=BATCH_SIZE,
                                                   shuffle=False)
# Make input function for testing:
# shuffle=False -> do not randomize input data
test_input_fn = tf.estimator.inputs.numpy_input_fn(x=make_dict(test_seq),
                                                   y=test_label,
                                                   batch_size=1,
                                                   shuffle=False)
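As a quick sanity check (not part of the original notebook), you can inspect the dictionary make_dict builds for the training set; every example carries a 10x3 window plus its length:

d = make_dict(train_seq)
print(d[FEATURE_KEY].shape)   # (30000, 10, 3): one 10-day window of 3 features per example
print(d[SEQ_LEN_KEY].shape)   # (30000,): one length entry per example
print(d[SEQ_LEN_KEY][:5])     # [10 10 10 10 10]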
In [52]:
N_OUTPUTS = 1 # 1 prediction
NUM_FEATURES = 3
def get_model_fn(rnn_cell_sizes,
                 label_dimension,
                 dnn_layer_sizes=[],
                 optimizer='SGD',
                 learning_rate=0.01):

    def model_fn(features, labels, mode, params):

        x = features[FEATURE_KEY]
        sequence_length = features[SEQ_LEN_KEY]

        # 1. configure the RNN
        # Each RNN layer will consist of an LSTM cell
        rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in rnn_cell_sizes]

        # Construct the layers
        multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
        outputs, _ = tf.nn.dynamic_rnn(multi_rnn_cell, x, dtype=tf.float32)

        # Slice to keep only the last cell of the RNN
        last_activations = rnn_common.select_last_activations(outputs,
                                                              sequence_length)

        # Construct dense layers on top of the last cell of the RNN
        for units in dnn_layer_sizes:
            last_activations = tf.layers.dense(last_activations,
                                               units,
                                               activation=tf.nn.relu)

        # Final dense layer for prediction
        predictions = tf.layers.dense(last_activations, label_dimension)

        # 2. Define the loss function for training/evaluation
        loss = None
        eval_metric_ops = None
        train_op = None

        # labels can be None when predicting
        if mode != tf.estimator.ModeKeys.PREDICT:
            loss = tf.losses.mean_squared_error(labels, predictions)
            eval_metric_ops = {
                "rmse": tf.metrics.root_mean_squared_error(labels, predictions)
            }

            # 3. Define the training operation/optimizer
            train_op = tf.contrib.layers.optimize_loss(
                loss=loss,
                global_step=tf.contrib.framework.get_global_step(),
                learning_rate=learning_rate,
                optimizer=optimizer)

        # 4. Create predictions
        predictions_dict = {"predicted": predictions}

        # 5. Return the EstimatorSpec
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions_dict,
            loss=loss,
            train_op=train_op,
            eval_metric_ops=eval_metric_ops)

    return model_fn
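rnn_common.select_last_activations picks, for each example in the batch, the RNN output at its last time step (index sequence_length - 1). If you prefer not to depend on tf.contrib, a rough hand-rolled equivalent could look like the sketch below; this is an assumption written against TF 1.x ops, not the original implementation.

def last_relevant_output(outputs, sequence_length):
    # outputs: [batch, time, units], sequence_length: [batch] int32
    batch_size = tf.shape(outputs)[0]
    indices = tf.stack([tf.range(batch_size), sequence_length - 1], axis=1)
    return tf.gather_nd(outputs, indices)  # [batch, units]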
In [53]:
model_fn = get_model_fn(rnn_cell_sizes=[64],   # size of the hidden layers
                        label_dimension=1,     # since it's just 1 prediction
                        dnn_layer_sizes=[32],  # size of the dense layers on top of the RNN
                        optimizer='Adam',
                        learning_rate=0.001)
estimator = tf.estimator.Estimator(model_fn=model_fn)
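By default the Estimator writes its checkpoints and event files to a temporary directory. If you want to keep them (for TensorBoard or to resume training), you can pass a model_dir of your choice; the directory name below is just an example, not part of the original notebook.

# optional: persist checkpoints/summaries in a directory you control
estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir='weather_model')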
In [54]:
estimator.train(input_fn=train_input_fn, steps=10000)
In [55]:
ev = estimator.evaluate(input_fn=eval_input_fn)
print(ev)
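evaluate returns a dict containing the loss and the metrics defined in the model_fn, so you can read the RMSE directly:

print('eval RMSE:', ev['rmse'])  # key defined in eval_metric_ops above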
In [56]:
preds = list(estimator.predict(input_fn=test_input_fn))
predictions = []
for p in preds:
    print(p)
    predictions.append(p["predicted"][0])
In [57]:
# plotting real values in black
sns.tsplot(test_plot_data, color="black")
# plotting predictions in red
sns.tsplot(predictions, color="red")
plt.show()
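For a numeric summary to go with the plot, you can compute the RMSE between the predicted and real test temperatures directly with numpy (a small addition, not in the original notebook):

test_rmse = np.sqrt(np.mean((np.asarray(predictions) - np.asarray(test_plot_data)) ** 2))
print('test RMSE:', test_rmse)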