In [105]:
#!/usr/bin/env python
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# original code from: https://github.com/GoogleCloudPlatform/training-data-analyst/tree/master/blogs/timeseries
# modified by: Marianne Linhares, monteirom@google.com, May 2017
# tensorflow
import tensorflow as tf
import tensorflow.contrib.learn as tflearn
import tensorflow.contrib.layers as tflayers
from tensorflow.contrib.learn.python.learn import learn_runner
import tensorflow.contrib.metrics as metrics
import tensorflow.contrib.rnn as rnn
# Rnn common functions
from tensorflow.contrib.learn.python.learn.estimators import rnn_common
# visualization
import matplotlib.pyplot as plt
# helpers
import numpy as np
import pandas as pd
import csv
# Set TensorFlow's logger to INFO verbosity so that INFO-level messages
# (and anything more severe) are shown in the notebook output.
tf.logging.set_verbosity(tf.logging.INFO)
We're using a weather dataset. [TODO: describe the data set, how the data was generated, and other details.]
The goal is to predict the next day's average temperature from features observed on the preceding days. More specifically, we use the data from 10 consecutive days to predict the average temperature of the following day.
In [71]:
# Read the raw weather table and remember how many rows it has; the row
# count is used later to size the sequence arrays.
df = pd.read_csv('weather.csv')
number_of_rows = df.shape[0]

print('number of rows in the dataset:', number_of_rows)
print('how a row looks like:')
print(df.head(11))
print()

# Keep only the three measurement columns; the date columns are dropped.
print("we don't the year mo da columns, so let's forget about them")
feature_columns = ['avg_tmp', 'avg_dewp', 'avg_slp']
df = df[feature_columns]
print(df.head(11))
In [72]:
SEQ_LEN = 10
NUM_FEATURES = 3
VALID_ROWS = number_of_rows - SEQ_LEN - 1

# Convert to a plain array so rows can be addressed by integer position.
df = np.asarray(df)

# sequences: [VALID_ROWS, SEQ_LEN, NUM_FEATURES]
# labels:    [VALID_ROWS, 1]
sequences = np.zeros((VALID_ROWS, SEQ_LEN, NUM_FEATURES), dtype=np.float32)
labels = np.zeros((VALID_ROWS, 1))

# Only full windows of SEQ_LEN rows are used.
# @monteirom: shorter windows could be supported by passing the real sequence
# length to the dynamic RNN, but for now let's keep things simple.
for window_start in range(VALID_ROWS):
    window_end = window_start + SEQ_LEN
    # features: SEQ_LEN consecutive rows starting at window_start
    sequences[window_start] = df[window_start:window_end]
    # label: column 0 of the row that immediately follows the window
    labels[window_start] = df[window_end, 0]

separator = '-' * 20
print(separator)
print('Example')
print(separator)
print('sequence:')
print(sequences[0])
print('prediction:', labels[0])
In [146]:
# Split sizes chosen by the original author for 32083 valid rows
# (30000 + 2073 + 10 = 32083).
TRAIN_SIZE = 30000
EVAL_SIZE = 2073
TEST_SIZE = 10

# TODO(@monteirom): shuffle
# The split is positional: first TRAIN_SIZE windows for training, the next
# EVAL_SIZE for evaluation, and whatever remains for testing.
eval_end = TRAIN_SIZE + EVAL_SIZE

train_seq = sequences[:TRAIN_SIZE]
eval_seq = sequences[TRAIN_SIZE:eval_end]
test_seq = sequences[eval_end:]

# Labels are converted to float32 to match the dtype of the sequences.
train_label = labels[:TRAIN_SIZE].astype(np.float32)
eval_label = labels[TRAIN_SIZE:eval_end].astype(np.float32)
test_label = labels[eval_end:].astype(np.float32)

print('train shape:', train_seq.shape)
print('eval shape:', eval_seq.shape)
print('test shape:', test_seq.shape)
In [166]:
# Ground-truth values of the first TEST_SIZE test examples (column 0 of the
# [n, 1] label array). Slicing is used so that a test split shorter than
# TEST_SIZE yields a shorter plot instead of an IndexError.
test_plot_data = list(test_label[:TEST_SIZE, 0])

# Plotted with matplotlib, which is imported at the top of the notebook;
# seaborn is not imported anywhere, so calling it here would fail on a
# freshly started kernel.
plt.plot(test_plot_data)
plt.show()
In [148]:
BATCH_SIZE = 64
FEATURE_KEY = 'x'
SEQ_LEN_KEY = 'sequence_length'


def make_dict(x, seq_len=None):
    """Wrap an array of sequences in the feature dict consumed by the model.

    Args:
      x: array whose first axis indexes examples and whose second axis is
        the time axis, e.g. shape [num_examples, seq_len, num_features].
      seq_len: length to report for every sequence. When omitted, the size
        of the second axis of `x` is used, i.e. every sequence is treated
        as full-length.

    Returns:
      A dict with two entries:
        FEATURE_KEY -> `x` itself (not copied);
        SEQ_LEN_KEY -> int32 array of shape [num_examples] holding the
          length of each sequence.
    """
    if seq_len is None:
        # Derive the length from the data actually passed in rather than from
        # a module-level constant, so the two can never disagree.
        seq_len = x.shape[1]
    return {
        FEATURE_KEY: x,
        SEQ_LEN_KEY: np.asarray(x.shape[0] * [seq_len], dtype=np.int32),
    }
# All three input functions are built with the same numpy-backed factory.
make_input_fn = tf.estimator.inputs.numpy_input_fn

# Training input:
#   num_epochs=None -> cycle through the input data forever
#   shuffle=True    -> randomize the order of the input data
train_input_fn = make_input_fn(
    x=make_dict(train_seq),
    y=train_label,
    batch_size=BATCH_SIZE,
    num_epochs=None,
    shuffle=True)

# Evaluation input:
#   shuffle=False -> keep the input data in its original order
eval_input_fn = make_input_fn(
    x=make_dict(eval_seq),
    y=eval_label,
    batch_size=BATCH_SIZE,
    shuffle=False)

# Test input: one example per batch.
#   shuffle=False -> keep the input data in its original order
test_input_fn = make_input_fn(
    x=make_dict(test_seq),
    y=test_label,
    batch_size=1,
    shuffle=False)
In [140]:
N_OUTPUTS = 1  # 1 prediction
NUM_FEATURES = 3


def get_model_fn(rnn_cell_sizes,
                 label_dimension,
                 dnn_layer_sizes=(),
                 optimizer='SGD',
                 learning_rate=0.01):
    """Build a `model_fn` for a stacked-LSTM regressor.

    Args:
      rnn_cell_sizes: iterable of ints; one LSTM layer is created per entry,
        with that many units.
      label_dimension: number of values predicted per example (width of the
        final dense layer).
      dnn_layer_sizes: iterable of ints; one ReLU dense layer per entry is
        stacked on top of the last RNN output. Empty means no extra layers.
      optimizer: forwarded to `tf.contrib.layers.optimize_loss`.
      learning_rate: forwarded to `tf.contrib.layers.optimize_loss`.

    Returns:
      A function `model_fn(features, targets, mode, params)` that returns a
      `tflearn.ModelFnOps`.
    """

    def model_fn(features, targets, mode, params):
        """Model function: LSTM stack -> dense layers -> linear output.

        Args:
          features: dict holding the input sequences under FEATURE_KEY and
            the per-example sequence lengths under SEQ_LEN_KEY.
          targets: label tensor; only used outside INFER mode.
          mode: one of the `tflearn.ModeKeys` values.
          params: unused.
        """
        x = features[FEATURE_KEY]
        sequence_length = features[SEQ_LEN_KEY]

        # 1. configure the RNN
        # Each RNN layer will consist of a LSTM cell
        rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in rnn_cell_sizes]
        # Construct the layers
        multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
        # The sequence lengths are handed to the RNN as well, so steps past
        # the end of a sequence are not processed.
        outputs, _ = tf.nn.dynamic_rnn(multi_rnn_cell,
                                       x,
                                       sequence_length=sequence_length,
                                       dtype=tf.float32)

        # Slice to keep only the last cell of the RNN
        last_activations = rnn_common.select_last_activations(outputs,
                                                              sequence_length)

        # Construct dense layers on top of the last cell of the RNN
        for units in dnn_layer_sizes:
            last_activations = tf.layers.dense(last_activations,
                                               units,
                                               activation=tf.nn.relu)

        # Final dense layer for prediction
        predictions = tf.layers.dense(last_activations, label_dimension)

        # 2./3. Loss, metrics and training op need the labels, so they are
        # only built for training and evaluation. When predicting, labels may
        # not be available and none of these ops are needed.
        loss = None
        train_op = None
        eval_metric_ops = None
        if mode != tflearn.ModeKeys.INFER:
            loss = tf.losses.mean_squared_error(targets, predictions)
            eval_metric_ops = {
                "rmse": tf.metrics.root_mean_squared_error(targets, predictions)
            }
            train_op = tf.contrib.layers.optimize_loss(
                loss=loss,
                global_step=tf.contrib.framework.get_global_step(),
                learning_rate=learning_rate,
                optimizer=optimizer)

        # 4. Create predictions
        predictions_dict = {"predicted": predictions}

        # 5. return ModelFnOps
        return tflearn.ModelFnOps(
            mode=mode,
            predictions=predictions_dict,
            loss=loss,
            train_op=train_op,
            eval_metric_ops=eval_metric_ops)

    return model_fn
In [141]:
# Model configuration:
#   rnn_cell_sizes  -> size of the hidden (LSTM) layers
#   label_dimension -> 1, since there is just one prediction per example
#   dnn_layer_sizes -> units of the dense layers stacked on top of the RNN
model_fn = get_model_fn(
    rnn_cell_sizes=[64],
    label_dimension=1,
    dnn_layer_sizes=[32],
    optimizer='Adam',
    learning_rate=0.001)

estimator = tflearn.Estimator(model_fn=model_fn)
In [167]:
# Train for 10000 steps on the training input function.
estimator.fit(steps=10000, input_fn=train_input_fn)
Out[167]:
In [168]:
# Evaluate the trained model on the evaluation input function and show the
# result it returns.
ev = estimator.evaluate(
    input_fn=eval_input_fn,
)
print(ev)
In [169]:
# Run the model over the test input and materialise every result.
preds = list(estimator.predict(input_fn=test_input_fn))

# Show each raw prediction entry.
for pred in preds:
    print(pred)

# Keep only the first value stored under "predicted" for each test example.
predictions = [pred["predicted"][0] for pred in preds]
In [170]:
# Plotted with matplotlib, which is imported at the top of the notebook;
# seaborn is not imported anywhere, so calling it here would fail on a
# freshly started kernel.
# real values in black
plt.plot(test_plot_data, color="black", label="real")
# predictions in red
plt.plot(predictions, color="red", label="predicted")
plt.legend()
plt.show()