In [2]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import time
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
import tensorflow as tf
import scipy

In [3]:
# Download and save the archived data

url = 'http://mrtee.europa.renci.org/~bblanton/ANN/'
to = "../data"

def maybe_download(filename, expected_bytes, force=False):
    """Download a file if not present, and make sure it's the right size."""
    dest = os.path.join(to, filename)
    print(dest)
    print(url + filename)
    if force or not os.path.exists(dest):
        dest, _ = urlretrieve(url + filename, dest)
    statinfo = os.stat(dest)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return dest

data_filename = maybe_download('ann_dataset1.tar', 5642240)


../data/ann_dataset1.tar
http://mrtee.europa.renci.org/~bblanton/ANN/ann_dataset1.tar
Found and verified ann_dataset1.tar
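
Size alone is a weak integrity check. A minimal, optional sketch of checksum verification (this helper is an addition for illustration, not part of the original pipeline):

import hashlib

def sha256_of(path, chunk_size=1 << 20):
    # hash the file in chunks so large archives need not fit in memory
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()

# print the digest once, then pin it next to expected_bytes if desired
print(sha256_of(os.path.join(to, 'ann_dataset1.tar')))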

In [4]:
# Dataset with ten output locations
# Extract files from the archive
def maybe_extract(filename, force=False):
    extract_folder = os.path.splitext(os.path.splitext(filename)[0])[0]  # strip .tar (or .tar.gz)
    root = os.path.dirname(filename)
    if os.path.isdir(extract_folder) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s.' % (root, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        tar = tarfile.open(filename)
        sys.stdout.flush()
        tar.extractall(path = root)
        tar.close()
    data_files = [
        os.path.join(extract_folder, d) for d in sorted(os.listdir(extract_folder))
        if os.path.isfile(os.path.join(extract_folder, d))]
    return data_files
  
data_filename = "../data/ann_dataset_10points.tar"
data_files = maybe_extract(data_filename)

# Load files and produce dataset
def maybe_load(file_names):
    names = ('index','time', 'long', 'lat', 'param1', 'param2', 'param3', 'param4', 'out1', 'out2',
            'out3', 'out4','out5', 'out6','out7', 'out8','out9', 'out10',)
    datafile_length = 193
    dataset = np.ndarray(shape=(len(file_names), datafile_length, len(names)))
    for i in range(len(file_names)):
        # each file holds datafile_length rows of len(names) whitespace-separated values
        a = np.loadtxt(file_names[i]).reshape([datafile_length, len(names)])
        dataset[i,:,:] = a
        if i%100 == 0:
            print("Processed %d/%d \n"%(i,len(file_names)))
    return dataset

dataset = maybe_load(data_files)
print(dataset.shape)


../data already present - Skipping extraction of ../data/ann_dataset_10points.tar.
Processed 0/324 

Processed 100/324 

Processed 200/324 

Processed 300/324 

(324, 193, 18)
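
A quick sanity check (added here, not in the original notebook): print the first few rows of one loaded file against the column names used in maybe_load. The names tuple simply mirrors the one defined inside that function.

names = ('index', 'time', 'long', 'lat', 'param1', 'param2', 'param3', 'param4',
         'out1', 'out2', 'out3', 'out4', 'out5', 'out6', 'out7', 'out8', 'out9', 'out10')
for row in dataset[0, :3, :]:
    # pair each value in the row with its column name
    print(', '.join('%s=%.3f' % (n, v) for n, v in zip(names, row)))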

In [7]:
# train, validation, and test dataset percentages (for reference only; the split indices below are hard-coded)
train_percent = 70
valid_percent = 15
test_percent = 15

# train, validation, and test dataset indices
# test: test_start : valid_start-1
# validation: valid_start : train_start-1
# training: train_start : dataset.shape[0]
test_start = 0
valid_start = 48        # int(test_percent/100.0*dataset.shape[0])
train_start = 48 + 48   # int((test_percent+valid_percent)/100.0*dataset.shape[0])

# Shuffle file indices (list() keeps np.random.shuffle working under Python 3 as well)
file_indices = list(range(dataset.shape[0]))
np.random.shuffle(file_indices)

# Assign datasets
test_dataset = np.array([dataset[j,:,:] for j in [file_indices[i] for i in range(test_start, valid_start)]])
valid_dataset = np.array([dataset[j,:,:] for j in [file_indices[i] for i in range(valid_start, train_start)]])
train_dataset = np.array([dataset[j,:,:] for j in [file_indices[i] for i in range(train_start, dataset.shape[0])]])

# Save memory
#del(dataset)
print("Test dataset: "+str(test_dataset.shape))
print("Validation dataset: "+str(valid_dataset.shape))
print("Training dataset: "+str(train_dataset.shape))


Test dataset: (48, 193, 18)
Validation dataset: (48, 193, 18)
Training dataset: (228, 193, 18)
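
Because the split relies on np.random.shuffle without a fixed seed, re-running the notebook produces a different train/validation/test partition each time. A hedged sketch of a reproducible variant (the seed value is arbitrary):

np.random.seed(0)                      # any fixed value makes the split repeatable
file_indices = list(range(dataset.shape[0]))
np.random.shuffle(file_indices)
# ...then rebuild test_dataset, valid_dataset and train_dataset exactly as above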

In [44]:
location = 3    # which of the ten output locations to model

def accuracy_mse(predictions, outputs):
    err = predictions-outputs
    return np.mean(err*err)

def Normalize(x, means, stds):
    return (x-means)/stds

# Reshape the data and normalize
# Inputs are columns 1-6 (time, long, lat, param1-param3); the target is
# column 8+location, i.e. the output series at the chosen location.

train_dataset2 = train_dataset[:,:,1:7].reshape((-1, 6)).astype(np.float32)
train_output = train_dataset[:,:,8+location].reshape((-1, 1)).astype(np.float32)

# calculate means and stds for training dataset
input_means = [np.mean(train_dataset2[:,i]) for i in range(train_dataset2.shape[1])]
input_stds = [np.std(train_dataset2[:,i]) for i in range(train_dataset2.shape[1])]
output_means = [np.mean(train_output[:,i]) for i in range(train_output.shape[1])]
output_stds = [np.std(train_output[:,i]) for i in range(train_output.shape[1])]

train_dataset2 = Normalize(train_dataset2, input_means, input_stds)
#train_output = Normalize(train_output, output_means, output_stds)

print(train_dataset2.shape)
print(train_output.shape)

#plt.plot(train_dataset2[:193,:],label="input")
#plt.plot(train_output[:193,:],label="output")
#plt.ylabel("training data")
#plt.legend(loc='upper right', shadow=True)
#plt.show()

valid_dataset2 = Normalize(valid_dataset[:,:,1:7].reshape((-1, 6)).astype(np.float32), input_means, input_stds)
valid_output = valid_dataset[:,:,8+location].reshape((-1, 1)).astype(np.float32)
#valid_output = Normalize(valid_dataset[:,:,8:18].reshape((-1, 2)).astype(np.float32), output_means, output_stds)

test_dataset2 = Normalize(test_dataset[:,:,1:7].reshape((-1, 6)).astype(np.float32),input_means, input_stds)
test_output = test_dataset[:,:,8+location].reshape((-1, 1)).astype(np.float32)
#test_output = Normalize(test_dataset[:,:,8:18].reshape((-1, 2)).astype(np.float32), output_means, output_stds)


(44004, 6)
(44004, 1)
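
A small check, added here for illustration: after normalization the training inputs should have roughly zero mean and unit standard deviation per column, while the validation and test inputs deviate slightly because they reuse the training statistics.

print(np.round(train_dataset2.mean(axis=0), 3), np.round(train_dataset2.std(axis=0), 3))
print(np.round(valid_dataset2.mean(axis=0), 3), np.round(valid_dataset2.std(axis=0), 3))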

In [21]:
num_embeds = 2 # number of consecutive time steps stacked as inputs; 0 means use the original array unchanged
#print(train_dataset2[:-1,:])
#print(train_dataset2[1:,:])

def get_embeddings(dataset, num_embeds = 1):
    """Stack num_embeds consecutive rows side by side, so each sample also
    carries the inputs from the following time steps."""
    dataset_list = []
    if num_embeds == 0:
        return dataset
    for i in range(num_embeds):
        # slice i keeps rows i .. len-num_embeds+i-1, i.e. the i-th shifted copy
        dataset_list.append(dataset[i:(-num_embeds+i),:])
        #dataset_list.append(dataset[num_embeds:,:])
    return np.hstack(dataset_list)

train_dataset2 = get_embeddings(train_dataset2, num_embeds)
valid_dataset2 = get_embeddings(valid_dataset2, num_embeds)
test_dataset2 = get_embeddings(test_dataset2, num_embeds)
train_output = train_output[num_embeds:,:]
valid_output = valid_output[num_embeds:,:]
test_output = test_output[num_embeds:,:]
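
To make the lag-stacking explicit, a toy illustration (not part of the original cell) of what get_embeddings does with num_embeds = 2: each returned row stacks the inputs of two consecutive time steps, and the targets are then trimmed so they start num_embeds steps in.

toy = np.arange(5, dtype=np.float32).reshape(5, 1)   # time steps 0..4, one feature
print(get_embeddings(toy, 2))
# [[0. 1.]
#  [1. 2.]
#  [2. 3.]]   -> row t holds the inputs at times t and t+1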

In [45]:
def variance(x):
    return tf.reduce_mean(tf.square(x-tf.reduce_mean(x)))

def explained_var(y_true, y_predicted):
    return 1 - tf.div(variance(tf.sub(y_true,y_predicted)),variance(y_true))

input_size = train_dataset2.shape[1]
output_size = 1

# ANN with a single tanh hidden layer
batch_size = 57*193
hidden_nodes = 32 #64

num_steps = 30001
starter_learning_rate = 0.02

graph = tf.Graph()
with graph.as_default():

    x = tf.placeholder(tf.float32, shape=(None, input_size)) #train_dataset2.shape(2)
    y = tf.placeholder(tf.float32, shape=(None, output_size))
    
    weights_0 = tf.Variable(tf.truncated_normal([input_size, hidden_nodes], stddev = 0.1, dtype = tf.float32))
    biases_0 = tf.Variable(tf.zeros([hidden_nodes], dtype = tf.float32))
    hidden_layer = tf.tanh(tf.matmul(x, weights_0) + biases_0)
    weights_1 = tf.Variable(tf.truncated_normal([hidden_nodes, output_size], stddev = 0.1, dtype = tf.float32))
    biases_1 = tf.Variable(tf.zeros([output_size], dtype = tf.float32))
    y_ = tf.matmul(hidden_layer, weights_1) + biases_1
    
    regularizers = sum([tf.nn.l2_loss(v) for v in tf.all_variables()])
    
    loss = tf.reduce_mean(tf.square(y_-y))
    loss += 1e-5 * regularizers
    
    global_step = tf.Variable(0.0, trainable=False)
    learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, num_steps, 0.5, staircase=False)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step=global_step)    

    """
    train_prediction = loss
    valid_prediction = tf.tanh(tf.matmul(tf_valid_dataset, weights_0) + biases_0)
    valid_prediction = tf.tanh(tf.matmul(valid_prediction, weights_1) + biases_1)
    valid_prediction = tf.matmul(valid_prediction, weights_2) + biases_2
    
    test_prediction = tf.tanh(tf.matmul(tf_test_dataset, weights_0) + biases_0)
    test_prediction = tf.tanh(tf.matmul(test_prediction, weights_1) + biases_1)
    test_prediction = tf.matmul(test_prediction, weights_2) + biases_2
    """

In [46]:
def stop(data, n = 3):
    """Return True when each of the last n values sits above the mean of the
    whole series, i.e. the monitored loss has stopped decreasing."""
    assert len(data) > n*2
    avg = sum(data)/len(data)
    for x in (data-avg)[-n:]:
        if x <= 0:
            return False
    return True

from sklearn.metrics import explained_variance_score

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    sum_l = 0
    num_l = 0
    #ev_l = [0,0,0,0,0,0,0]
    print('Initialized')
    for step in range(num_steps):
        offset = (step * batch_size) % (train_output.shape[0] - batch_size)
        batch_data = train_dataset2[offset:(offset + batch_size), :]
        batch_output = train_output[offset:(offset + batch_size), :]
        feed_dict = {x : batch_data, y : batch_output}
        start_time = time.time()
        _, l, r = session.run([optimizer, loss, regularizers],feed_dict=feed_dict)
        duration = time.time()-start_time
        
        sum_l += l
        num_l += 1

        if (step % 500 == 0):
            valid_loss = loss.eval(feed_dict = {x: valid_dataset2, y: valid_output})
            #print(predictions)
            #ev = explained_variance_score(y_.eval(feed_dict = {x: valid_dataset2, y: valid_output}), valid_output)
            #ev_l.append(valid_loss)
            #ev_l = ev_l[1:]
            print('Loss at step %d (%.2f op/sec): %.6f (%.6f); validation loss: %.6f' % (
                    step, 1.0/duration, sum_l/num_l, r, 
                    #accuracy_mse(valid_prediction.eval(), valid_output)))
                    valid_loss))
            sum_l = 0
            num_l = 0
            #if stop(ev_l):
            #    print("Non decreasing scores, so stopping early")
            #    break
    
    feed_dict = {x: test_dataset2, y: test_output}
    predictions, test_loss = session.run([y_, loss],feed_dict=feed_dict)
    #test_loss = loss.eval(feed_dict = {x: test_dataset2, y: test_output})
    print('Test MSE: %.4f' % test_loss)
    #print('Test losses:', test_losses)
    predicted_vs_actual = np.hstack((predictions, test_output))


Initialized
Loss at step 0 (115.94 op/sec): 0.019022 (0.797307); validation loss: 0.053324
Loss at step 500 (225.79 op/sec): 0.004713 (7.307686); validation loss: 0.002536
Loss at step 1000 (220.80 op/sec): 0.002004 (10.933863); validation loss: 0.002247
Loss at step 1500 (231.86 op/sec): 0.001839 (13.539793); validation loss: 0.002352
Loss at step 2000 (239.29 op/sec): 0.001769 (15.379843); validation loss: 0.002154
Loss at step 2500 (216.96 op/sec): 0.001743 (16.507231); validation loss: 0.002179
Loss at step 3000 (224.05 op/sec): 0.001735 (17.347557); validation loss: 0.002181
Loss at step 3500 (227.27 op/sec): 0.001721 (18.054956); validation loss: 0.002070
Loss at step 4000 (232.45 op/sec): 0.001716 (18.715330); validation loss: 0.002135
Loss at step 4500 (236.97 op/sec): 0.001704 (19.223320); validation loss: 0.002055
Loss at step 5000 (220.46 op/sec): 0.001692 (19.604223); validation loss: 0.002209
Loss at step 5500 (238.72 op/sec): 0.001698 (19.824041); validation loss: 0.002075
Loss at step 6000 (232.50 op/sec): 0.002212 (20.197359); validation loss: 0.002299
Loss at step 6500 (219.78 op/sec): 0.001682 (20.404509); validation loss: 0.002085
Loss at step 7000 (241.43 op/sec): 0.001645 (20.338596); validation loss: 0.002111
Loss at step 7500 (221.92 op/sec): 0.001657 (20.371290); validation loss: 0.002094
Loss at step 8000 (217.21 op/sec): 0.001671 (20.442711); validation loss: 0.002081
Loss at step 8500 (231.54 op/sec): 0.001675 (20.635008); validation loss: 0.002301
Loss at step 9000 (218.86 op/sec): 0.001688 (20.939100); validation loss: 0.002218
Loss at step 9500 (224.82 op/sec): 0.001678 (21.281950); validation loss: 0.002084
Loss at step 10000 (220.50 op/sec): 0.001672 (21.631639); validation loss: 0.002101
Loss at step 10500 (220.02 op/sec): 0.001665 (22.040117); validation loss: 0.002127
Loss at step 11000 (260.42 op/sec): 0.001655 (22.756372); validation loss: 0.002069
Loss at step 11500 (237.76 op/sec): 0.001632 (23.378948); validation loss: 0.002007
Loss at step 12000 (237.41 op/sec): 0.001621 (23.664389); validation loss: 0.002058
Loss at step 12500 (219.30 op/sec): 0.001609 (23.923544); validation loss: 0.002123
Loss at step 13000 (217.11 op/sec): 0.001604 (24.327360); validation loss: 0.002084
Loss at step 13500 (227.89 op/sec): 0.001597 (24.762260); validation loss: 0.002104
Loss at step 14000 (189.32 op/sec): 0.001798 (25.369978); validation loss: 0.001990
Loss at step 14500 (208.11 op/sec): 0.001533 (25.393507); validation loss: 0.001983
Loss at step 15000 (231.42 op/sec): 0.001535 (25.393419); validation loss: 0.002076
Loss at step 15500 (237.03 op/sec): 0.001588 (25.568277); validation loss: 0.002031
Loss at step 16000 (218.92 op/sec): 0.001530 (25.489130); validation loss: 0.002007
Loss at step 16500 (225.27 op/sec): 0.001524 (25.402985); validation loss: 0.002026
Loss at step 17000 (224.57 op/sec): 0.001520 (25.279354); validation loss: 0.002097
Loss at step 17500 (226.55 op/sec): 0.001494 (25.169559); validation loss: 0.002088
Loss at step 18000 (223.21 op/sec): 0.001474 (25.061171); validation loss: 0.002106
Loss at step 18500 (228.57 op/sec): 0.001457 (24.955902); validation loss: 0.002055
Loss at step 19000 (212.59 op/sec): 0.001450 (24.927488); validation loss: 0.001961
Loss at step 19500 (213.31 op/sec): 0.001441 (24.936466); validation loss: 0.001994
Loss at step 20000 (223.86 op/sec): 0.001434 (24.979736); validation loss: 0.001947
Loss at step 20500 (208.29 op/sec): 0.001864 (25.399202); validation loss: 0.002553
Loss at step 21000 (218.62 op/sec): 0.001402 (25.262590); validation loss: 0.001943
Loss at step 21500 (216.55 op/sec): 0.001375 (25.192738); validation loss: 0.001928
Loss at step 22000 (209.73 op/sec): 0.001379 (25.149221); validation loss: 0.001917
Loss at step 22500 (210.93 op/sec): 0.001398 (25.198700); validation loss: 0.002035
Loss at step 23000 (215.42 op/sec): 0.001433 (25.326996); validation loss: 0.001919
Loss at step 23500 (221.24 op/sec): 0.001405 (25.359001); validation loss: 0.001959
Loss at step 24000 (214.69 op/sec): 0.001416 (25.378788); validation loss: 0.001953
Loss at step 24500 (219.72 op/sec): 0.001404 (25.369316); validation loss: 0.002079
Loss at step 25000 (213.13 op/sec): 0.001407 (25.349968); validation loss: 0.001991
Loss at step 25500 (207.42 op/sec): 0.001402 (25.312408); validation loss: 0.002006
Loss at step 26000 (220.66 op/sec): 0.001401 (25.273981); validation loss: 0.002048
Loss at step 26500 (219.44 op/sec): 0.001397 (25.224693); validation loss: 0.001942
Loss at step 27000 (206.70 op/sec): 0.001395 (25.179367); validation loss: 0.001960
Loss at step 27500 (218.82 op/sec): 0.001393 (25.122702); validation loss: 0.001910
Loss at step 28000 (221.15 op/sec): 0.001387 (25.063507); validation loss: 0.001958
Loss at step 28500 (207.91 op/sec): 0.001389 (25.021704); validation loss: 0.001924
Loss at step 29000 (221.92 op/sec): 0.001381 (24.964090); validation loss: 0.001917
Loss at step 29500 (209.82 op/sec): 0.001371 (24.882746); validation loss: 0.001915
Loss at step 30000 (226.13 op/sec): 0.001469 (24.984497); validation loss: 0.001915
Test MSE: 0.0013
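
As a follow-up that is not part of the original run, explained_variance_score (imported above) can be applied directly to the collected test predictions; in predicted_vs_actual, column 0 holds the ANN predictions and column 1 the actual values.

print('Test explained variance: %.4f'
      % explained_variance_score(predicted_vs_actual[:, 1], predicted_vs_actual[:, 0]))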

In [47]:
over_95 = 0
print(predicted_vs_actual.shape)
# column 0 = ANN predictions, column 1 = actual values
cc = np.corrcoef(predicted_vs_actual[:,0],predicted_vs_actual[:,1])[0,1]   # Pearson correlation
m = np.max(np.abs(predicted_vs_actual[:,0]-predicted_vs_actual[:,1]))
rmse = np.sqrt(np.mean(np.square(predicted_vs_actual[:,0]-predicted_vs_actual[:,1])))
print("Results for location %d: R:%.4f, RMSE:%.6f, Max error: %.4f"%(location, cc, rmse, m))
k = np.argmax(np.abs(predicted_vs_actual[:,0]-predicted_vs_actual[:,1]))
print(k)

start = (k // 193)*193
stop = start + 193
#start = 0 
#stop = 20*193

print(predicted_vs_actual.shape)
fig = plt.figure(figsize=(10, 6), dpi=80)

for i in range(1):
    sp = fig.add_subplot(1,1,i+1)
    sp.plot(predicted_vs_actual[start:stop,i],color="blue", linewidth=1.0, linestyle="-", label="ANN")
    sp.plot(predicted_vs_actual[start:stop,i+1],color="red", linewidth=1.0, linestyle="-", label="actual")
plt.show()


(9264, 2)
Results for location 3: R:0.9640, RMSE:0.031761, Max error: 0.4646
1881
(9264, 2)
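
The R printed above is the Pearson correlation between prediction and truth. For comparison (added here, not in the original cell), the coefficient of determination can be computed directly; it is generally close to, but not identical to, the squared correlation.

res = predicted_vs_actual[:, 0] - predicted_vs_actual[:, 1]
ss_res = np.sum(res ** 2)
ss_tot = np.sum((predicted_vs_actual[:, 1] - np.mean(predicted_vs_actual[:, 1])) ** 2)
print('R^2 (coefficient of determination): %.4f' % (1.0 - ss_res / ss_tot))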

In [231]:
d = np.asarray([])

d = np.append(d,3)
d = np.append(d,2)
d = np.append(d,1)[1:]
print(d)

def stop(data, n = 2):
    assert len(data) > n*2
    avg = sum(data)/len(data)
    for x in (data-avg)[-n:]:
        if x <= 0:
            return False
    return True

print(stop(d))


[ 2.  1.]
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-231-59addae8a709> in <module>()
     14     return True
     15 
---> 16 print(stop(d))

<ipython-input-231-59addae8a709> in stop(data, n)
      7 
      8 def stop(data, n = 2):
----> 9     assert len(data) > n*2
     10     avg = sum(data)/len(data)
     11     for x in (data-avg)[-n:]:

AssertionError: 
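
The assertion fires because d holds only two values while stop(..., n=2) requires more than four. A hedged sketch of a variant that simply waits until enough history has accumulated (stop_safe is an illustrative name, not used elsewhere):

def stop_safe(data, n=2):
    data = np.asarray(data)
    if len(data) <= n * 2:
        return False        # not enough history yet -- keep training
    avg = data.mean()
    # stop only when every one of the last n values sits above the overall mean
    return bool(np.all((data - avg)[-n:] > 0))

print(stop_safe(d))                       # False: only two values so far
print(stop_safe([1., 1., 1., 5., 6.]))    # True: last two values above the mean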

In [134]:
from sklearn.metrics import explained_variance_score
import scipy.stats as stats
import pylab

# Note: this cell is from the earlier ten-output run, where predicted_vs_actual
# stacked ten predicted columns followed by ten actual columns.
ev = []
for i in range(10):
    y_true = predicted_vs_actual[:,i]
    y_pred = predicted_vs_actual[:,i+10]
    ev.append(explained_variance_score(y_true, y_pred))
    
print("Explained variance: ",ev)
diff = y_true-y_pred

#stats.probplot(diff, dist="norm", plot=pylab)
stats.probplot(diff, dist="t", sparams=(2), plot=pylab)
pylab.show()

num_bins = 100
# the histogram of the errors
n, bins, patches = plt.hist(diff, num_bins, normed=1, facecolor='green', alpha=0.5)

params = stats.t.fit(diff)
dist = stats.t(params[0], params[1], params[2])
x = np.linspace(-2, 2, num_bins)
plt.plot(x, dist.pdf(x), 'b-', lw = 3, alpha=0.5, label='t pdf')
plt.show()
print(params)

"""
num_bins = 100
# the histogram of the errors
n, bins, patches = plt.hist(diff, num_bins, normed=1, facecolor='green', alpha=0.5)

# add a normal PDF
mu = 0
sigma = .05
y = mlab.normpdf(bins, mu, sigma)
plt.plot(bins, y, 'r-')
plt.xlabel('Smarts')
plt.ylabel('Probability')

# add Cauchy PDF
params = cauchy.fit(diff)
print(params)
dist = cauchy(params[0], params[1])
x = np.linspace(-2, 2, num_bins)
plt.plot(x, dist.pdf(x), 'b-', alpha=0.5, label='cauchy pdf')


# Tweak spacing to prevent clipping of ylabel
#plt.subplots_adjust(left=0.15)
#plt.show()

fig = plt.figure(figsize=(10,6),dpi=80)
plt.hist(diff, bins = 100, alpha=0.5)
plt.show()
"""


Explained variance:  [0.81146699190139771, 0.85759055614471436, 0.86870527267456055, 0.89061909914016724, 0.88695210218429565, 0.74401175975799561, 0.8059995174407959, 0.80061358213424683, 0.45739859342575073, 0.35861217975616455]
(1.4245720507513746, -0.030474568797174348, 0.062562678013771522)
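
As a follow-on check that is not in the original cell, the Student's t fit above can be compared against a normal fit with a Kolmogorov-Smirnov test; a smaller KS statistic indicates the closer fit to the residuals.

norm_params = stats.norm.fit(diff)
print('t fit:     ', stats.kstest(diff, 't', args=params))
print('normal fit:', stats.kstest(diff, 'norm', args=norm_params))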