In [1]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import time
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
import tensorflow as tf
import scipy
import math

In [2]:
# Download and save the archived data

url = 'http://mrtee.europa.renci.org/~bblanton/ANN/'
to = "../data"

def maybe_download(filename, expected_bytes, force=False):
    """Download a file if not present, and make sure it's the right size."""
    dest = os.path.join(to, filename)
    print(dest)
    print(url + filename)
    if force or not os.path.exists(dest):
        dest, _ = urlretrieve(url + filename, dest)
    statinfo = os.stat(dest)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        raise Exception(
          'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

data_filename = maybe_download('ann_dataset1.tar', 5642240)


../data/ann_dataset1.tar
http://mrtee.europa.renci.org/~bblanton/ANN/ann_dataset1.tar
Found and verified ann_dataset1.tar
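
In [ ]:
# Optional sketch, not part of the original workflow: file size is a fairly weak
# integrity check; a content hash is stronger. hashlib is used here purely as an
# illustration of the idea.
import hashlib

def file_md5(path, chunk_size=1 << 20):
    """Return the MD5 hex digest of a file, read in chunks."""
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(chunk_size), b''):
            md5.update(block)
    return md5.hexdigest()

print(file_md5(os.path.join(to, 'ann_dataset1.tar')))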

In [3]:
# Ten-output data set
# Extract files from the archive
def maybe_extract(filename, force=False):
    extract_folder = os.path.splitext(os.path.splitext(filename)[0])[0]  # strip .tar / .tar.gz
    root = os.path.dirname(filename)
    if os.path.isdir(extract_folder) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s.' % (root, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        tar = tarfile.open(filename)
        sys.stdout.flush()
        tar.extractall(path=root)
        tar.close()
    data_files = [
        os.path.join(extract_folder, d) for d in sorted(os.listdir(extract_folder))]
    return data_files
  
data_filename = "../data/ann_dataset_10points.tar"
data_files = maybe_extract(data_filename)

# Load files and produce dataset
def maybe_load(file_names):
    names = ('index', 'time', 'long', 'lat', 'param1', 'param2', 'param3', 'param4',
             'out1', 'out2', 'out3', 'out4', 'out5', 'out6', 'out7', 'out8', 'out9', 'out10')
    datafile_length = 193
    dataset = np.ndarray(shape=(len(file_names), datafile_length, len(names)))
    for i in range(len(file_names)):
        # each file holds 193 rows of 18 columns; the reshape doubles as a shape check
        a = np.loadtxt(file_names[i], dtype='d').reshape([datafile_length, len(names)])
        dataset[i, :, :] = a
        if i % 100 == 0:
            print("Processed %d/%d \n" % (i, len(file_names)))
    return dataset

dataset = maybe_load(data_files)
print(dataset.shape)


../data already present - Skipping extraction of ../data/ann_dataset_10points.tar.
Processed 0/324 

Processed 100/324 

Processed 200/324 

Processed 300/324 

(324, 193, 18)
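
In [ ]:
# Sanity-check sketch, not part of the original run: map the column names used in
# maybe_load onto the first row of the first file to confirm the layout
# (index, time, long, lat, param1..param4, out1..out10).
names = ('index', 'time', 'long', 'lat', 'param1', 'param2', 'param3', 'param4',
         'out1', 'out2', 'out3', 'out4', 'out5', 'out6', 'out7', 'out8', 'out9', 'out10')
for name, value in zip(names, dataset[0, 0, :]):
    print('%-7s %g' % (name, value))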

In [4]:
# train, validation, and test dataset percentages
train_percent = 70
valid_percent = 15
test_percent = 15

# train, validation, and test dataset indices
# test: test_start : valid_start-1
# validation: valid_start : train_start-1
# training: train_start : dataset.shape[0]
test_start = 0 
valid_start = 48 #int(test_percent/100.0*dataset.shape[0])
train_start = 48 + 48 #int((test_percent+valid_percent)/100.0*dataset.shape[0])

# Shuffle file indices
file_indices = np.random.permutation(dataset.shape[0])

# Assign datasets
test_dataset = dataset[file_indices[test_start:valid_start], :, :]
valid_dataset = dataset[file_indices[valid_start:train_start], :, :]
train_dataset = dataset[file_indices[train_start:], :, :]

# Save memory
#del(dataset)
print("Test dataset: "+str(test_dataset.shape))
print("Validation dataset: "+str(valid_dataset.shape))
print("Training dataset: "+str(train_dataset.shape))


Test dataset: (48, 193, 18)
Validation dataset: (48, 193, 18)
Training dataset: (228, 193, 18)
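
In [ ]:
# Sketch: the split boundaries above are hard-coded; deriving them from the
# percentages (as in the commented-out formulas) gives 48 and 97 for 324 files,
# close to the 48 / 96 actually used.
n_files = dataset.shape[0]
print(int(test_percent / 100.0 * n_files),
      int((test_percent + valid_percent) / 100.0 * n_files))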

In [5]:
def accuracy_mse(predictions, outputs):
    err = predictions-outputs
    return np.mean(err*err)

def Normalize(x, means, stds):
    return (x-means)/stds

# Reshape the data and normalize

train_dataset2 = train_dataset[:,:,1:7].reshape((-1, 6)).astype(np.float32)
train_output = train_dataset[:,:,8:18].reshape((-1, 10)).astype(np.float32)
print("train outputs: ",train_dataset.shape)

# calculate means and stds for training dataset
input_means = train_dataset2.mean(axis=0)
input_stds = train_dataset2.std(axis=0)
output_means = train_output.mean(axis=0)
output_stds = train_output.std(axis=0)

train_dataset2 = Normalize(train_dataset2, input_means, input_stds)
#train_output = Normalize(train_output, output_means, output_stds)

print(train_dataset2.shape)
print(train_output.shape)

#plt.plot(train_dataset2[:193,:],label="input")
#plt.plot(train_output[:193,:],label="output")
#plt.ylabel("training data")
#plt.legend(loc='upper right', shadow=True)
#plt.show()

valid_dataset2 = Normalize(valid_dataset[:,:,1:7].reshape((-1, 6)).astype(np.float32), input_means, input_stds)
valid_output = valid_dataset[:,:,8:18].reshape((-1, 10)).astype(np.float32)
#valid_output = Normalize(valid_dataset[:,:,8:18].reshape((-1, 2)).astype(np.float32), output_means, output_stds)

test_dataset2 = Normalize(test_dataset[:,:,1:7].reshape((-1, 6)).astype(np.float32),input_means, input_stds)
test_output = test_dataset[:,:,8:18].reshape((-1, 10)).astype(np.float32)
#test_output = Normalize(test_dataset[:,:,8:18].reshape((-1, 2)).astype(np.float32), output_means, output_stds)


train dataset:  (228, 193, 18)
(44004, 6)
(44004, 10)
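
In [ ]:
# Sanity-check sketch, not part of the original run: after z-scoring with the
# training-set statistics, each input column should have ~zero mean and unit std.
print(np.round(train_dataset2.mean(axis=0), 6))
print(np.round(train_dataset2.std(axis=0), 6))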

In [69]:
batch_size=40
num_unrollings=10
"""
suppose there is a dataset d of the format:
x11, x21, x31, ..., xK1, y11, y21, ..., yM1
x12, x22, x32, ..., xK2, y12, y22, ..., yM2
...
x1N, x2N, x3N, ..., xKN, y1N, y2N, ..., yMN

The batch generator returns batches as tuples of two components: inputs and outputs.

inputs batches (batch_size = 3, num_unrollings = 4)
[[x11,...,xK1], [x1i,...,xKi], [x1j,...,xKj]], <- the first batch
[[x12,...,xK2], [x1i+1,...,xKi+1], [x1j+1,...,xKj+1]], <- the second batch
[[x13,...,xK3], [x1i+2,...,xKi+2], [x1j+2,...,xKj+2]] <- the third batch
[[x14,...,xK4], [x1i+3,...,xKi+3], [x1j+3,...,xKj+3]] <- the fourth batch

indices i and j are calculated based on cursors
"""

class BatchGenerator(object):
    def __init__(self, data, outs_index, batch_size, num_unrollings):
        """
        Creates a batch generator
        data -- the dataset
        outs_index -- index of the first outputs component
        batch_size -- how many samples in each batch. Note the samples are NOT sequential in time!
        num_unrollings -- how many batches to return. The batches are sequential in time
        """
        self._data = data # the complete dataset
        self._data_size = data.shape[0] # how many samples in the dataset
        self._data_width = data.shape[1] # how many components in both inputs and outputs
        self._outs_index = outs_index # where the outputs start
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        segment = self._data_size // self._batch_size 
        self._cursor = [offset * segment for offset in range(self._batch_size)] # starting points for each batch
        self._last_batch = self._next_batch() # generate and save the first batch
  
    def _next_batch(self):
        """
        Generate a single batch from the current cursor position in the data.
        Returns a tuple (inputs_batch,outputs_batch)
        """
        batch = np.zeros(shape=(self._batch_size, self._data_width), dtype=np.float64)  # prepare the batch array
        for b in range(self._batch_size): # cursors are indices where each data sample in the batch starts
            batch[b] = self._data[self._cursor[b],:] # copy the data
            self._cursor[b] = (self._cursor[b] + 1) % (self._data_size)
        return (batch[:,:self._outs_index],batch[:,self._outs_index:])
  
    def next(self):
        """
        Generate the next array of batches from the data. The array consists of
        the last batch of the previous array, followed by num_unrollings-1 new ones.
        """
        # make sure that the cursors stay within range
        self._cursor = [c%(self._data_size-self._num_unrollings) for c in self._cursor] 
        
        batches = [self._last_batch] # use the last batch as the first in the list
        for step in range(self._num_unrollings-1): # we only need _num_unrollings-1 new batches
            batches.append(self._next_batch())
        self._last_batch = batches[-1] # save the last batch to be reused next time
        return batches
    
d = np.column_stack((train_dataset2,train_output))
train_batches = BatchGenerator(data = d, outs_index = train_dataset2.shape[1], 
                               batch_size = batch_size, num_unrollings = num_unrollings)
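
In [ ]:
# Usage sketch, not part of the original run: one call to next() returns
# num_unrollings (inputs, outputs) tuples; inputs have shape (batch_size, 6)
# and outputs (batch_size, 10). A separate generator is built here so the
# cursors of train_batches are left untouched.
example_batches = BatchGenerator(data = d, outs_index = train_dataset2.shape[1],
                                 batch_size = batch_size, num_unrollings = num_unrollings)
example = example_batches.next()
print(len(example))                              # num_unrollings
print(example[0][0].shape, example[0][1].shape)  # (batch_size, 6) (batch_size, 10)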

In [76]:
num_nodes = 16
num_steps = 201
start_learning_rate = 0.02
rate_coeff = 0.5

input_size = 6
output_size = 10

graph = tf.Graph()
with graph.as_default():
  
    # Parameters:
    # Input gate: input, previous output, and bias.
    ix = tf.Variable(tf.truncated_normal([input_size, num_nodes], -0.1, 0.1))
    im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ib = tf.Variable(tf.zeros([1, num_nodes]))
    # Forget gate: input, previous output, and bias.
    fx = tf.Variable(tf.truncated_normal([input_size, num_nodes], -0.1, 0.1))
    fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    fb = tf.Variable(tf.zeros([1, num_nodes]))
    # Memory cell: input, state and bias.                             
    cx = tf.Variable(tf.truncated_normal([input_size, num_nodes], -0.1, 0.1))
    cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    cb = tf.Variable(tf.zeros([1, num_nodes]))
    # Output gate: input, previous output, and bias.
    ox = tf.Variable(tf.truncated_normal([input_size, num_nodes], -0.1, 0.1))
    om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ob = tf.Variable(tf.zeros([1, num_nodes]))
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Regression weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, output_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([output_size]))
  
    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates."""
        input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
        forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
        update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
        return output_gate * tf.tanh(state), state

    # Prepare placeholders for inputs and outputs.
    # A total of 2*num_unrollings placeholders must be fed into the network,
    # collected in the train_inputs and train_outputs lists.
    train_inputs = list()
    train_outputs = list()
    for _ in range(num_unrollings):
        train_inputs.append(
          tf.placeholder(tf.float32, shape=[batch_size, input_size])) #shape=[batch_size,1]))
        train_outputs.append(
          tf.placeholder(tf.float32, shape=[batch_size, output_size])) #shape=[batch_size,1]))
    
    # Unrolled LSTM loop.
    outputs = list() # list of outputs
    output = saved_output # recall the last saved output
    state = saved_state # recall the last saved state
    for i in train_inputs:
        output, state = lstm_cell(i, output, state)
        outputs.append(output)
    
    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output), 
                                  saved_state.assign(state)]):
        y = tf.matmul(tf.concat(0,outputs), w)+b
        loss = tf.reduce_mean(tf.square(y - tf.concat(0,train_outputs)))
          
    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(start_learning_rate, global_step, num_steps, rate_coeff, staircase=False)
    #optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)
    
    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = tf.placeholder(tf.float32, shape=[1,input_size])
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    
    sample_output, sample_state = lstm_cell(
        sample_input, saved_sample_output, saved_sample_state)
    
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.xw_plus_b(sample_output, w, b)
    #run_metadata = tf.RunMetadata()
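
In [ ]:
# Illustrative NumPy sketch (toy random weights, not the trained TensorFlow
# variables) of the gate equations implemented by lstm_cell above:
#   i_t = sigmoid(x W_ix + h_prev W_im + b_i)
#   f_t = sigmoid(x W_fx + h_prev W_fm + b_f)
#   c_t = f_t * c_prev + i_t * tanh(x W_cx + h_prev W_cm + b_c)
#   o_t = sigmoid(x W_ox + h_prev W_om + b_o)
#   h_t = o_t * tanh(c_t)
def np_sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

x = np.random.randn(1, input_size)     # one input sample
h_prev = np.zeros((1, num_nodes))      # previous output
c_prev = np.zeros((1, num_nodes))      # previous cell state
W = {k: 0.1 * np.random.randn(input_size if k.endswith('x') else num_nodes, num_nodes)
     for k in ('ix', 'im', 'fx', 'fm', 'cx', 'cm', 'ox', 'om')}  # biases omitted (zero)

i_t = np_sigmoid(x.dot(W['ix']) + h_prev.dot(W['im']))
f_t = np_sigmoid(x.dot(W['fx']) + h_prev.dot(W['fm']))
c_t = f_t * c_prev + i_t * np.tanh(x.dot(W['cx']) + h_prev.dot(W['cm']))
o_t = np_sigmoid(x.dot(W['ox']) + h_prev.dot(W['om']))
h_t = o_t * np.tanh(c_t)
print(h_t.shape, c_t.shape)            # (1, num_nodes) each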

In [80]:
summary_frequency = 200

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        batches = train_batches.next()
        feed_dict = dict()
        for i in range(num_unrollings):
            feed_dict[train_inputs[i]] = np.reshape(batches[i][0],(batch_size,input_size))
            feed_dict[train_outputs[i]] = np.reshape(batches[i][1],(batch_size,output_size))
        _, l, lr = session.run([optimizer, loss, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency

            # calculate losses at validation dataset
            reset_sample_state.run()
            predictions = np.zeros(shape=valid_output.shape)
            for i in range(valid_dataset2.shape[0]):
                predictions[i] = sample_prediction.eval(
                    {sample_input: np.reshape(valid_dataset2[i,:],(1,input_size))})
            valid_mse = np.mean(np.square(predictions-valid_output))
            print('Step %d: training loss: %.5f, validation loss: %.5f, learning rate: %.6f' % (step, mean_loss, valid_mse, lr))
            mean_loss = 0
                 
    print('=' * 80)
    reset_sample_state.run()
    predictions = np.zeros(shape=test_output.shape)
    for i in range(test_dataset2.shape[0]):
        predictions[i] = sample_prediction.eval({sample_input: np.reshape(test_dataset2[i,:],(1,input_size))})
    test_mse = np.mean(np.square(predictions-test_output))
    print('Training complete. Test MSE: %.5f'%test_mse)
    for out_no in range(predictions.shape[1]):
    #for out_no in range(0,FLAGS.output_vars):
        test_cc = np.corrcoef(predictions[:,out_no], test_output[:,out_no])[0,1]
        test_mse = ((predictions[:,out_no] - test_output[:,out_no]) ** 2).mean()
        print("Location %d: CC: %.4f, MSE: %.6f"%(out_no,test_cc,test_mse))


Initialized
Step 0: training loss: 0.09291, validation loss: 0.08843, learning rate: 0.020000
Step 200: training loss: 0.03985, validation loss: 0.03073, learning rate: 0.010035
================================================================================
Training complete. Test MSE: 0.02387
Location 0: CC: 0.8256, MSE: 0.005284
Location 1: CC: 0.8233, MSE: 0.004987
Location 2: CC: 0.7993, MSE: 0.004193
Location 3: CC: 0.8607, MSE: 0.003221
Location 4: CC: 0.8386, MSE: 0.008395
Location 5: CC: 0.7635, MSE: 0.011847
Location 6: CC: 0.8378, MSE: 0.032109
Location 7: CC: 0.8424, MSE: 0.052114
Location 8: CC: 0.7235, MSE: 0.041725
Location 9: CC: 0.7485, MSE: 0.074821
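
In [ ]:
# Sketch: a single overall correlation across all locations, computed on the
# flattened arrays (what the commented-out corrcoef line in the next cell
# appears to be aiming for).
overall_cc = np.corrcoef(predictions.ravel(), test_output.ravel())[0, 1]
print('Overall CC: %.4f' % overall_cc)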

In [75]:
#cc = np.corrcoef(test_output[:],predictions[:])[0,1]
#print(cc)
start = 0
stop = 48*193

fig = plt.figure(figsize=(10, 6), dpi=80)
for i in range(10):
    sp = fig.add_subplot(10,1,i+1)
    if i <= 4:
        sp.set_ylim([-0.5, 3.0])
    else:
        sp.set_ylim([-0.5, 3.5])
    sp.plot(predictions[start:stop,i],color="blue", linewidth=1.5, linestyle="-", label="prediction")
    sp.plot(test_output[start:stop,i],color="red", linewidth=1.5, linestyle="-", label="observation")
#plt.legend(loc='upper right')
plt.show()
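
In [ ]:
# Sketch: a zoomed view of a single test file (193 time steps) for one output
# location, with a legend; file_no and out_no are illustrative choices.
file_no, out_no = 0, 0
s, e = file_no * 193, (file_no + 1) * 193
plt.figure(figsize=(10, 3), dpi=80)
plt.plot(predictions[s:e, out_no], color="blue", linewidth=1.5, label="prediction")
plt.plot(test_output[s:e, out_no], color="red", linewidth=1.5, label="observation")
plt.legend(loc='upper right', shadow=True)
plt.show()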

In [ ]: