In [24]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import time
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
import tensorflow as tf
import scipy

In [25]:
# Download and save the archived data

url = 'http://mrtee.europa.renci.org/~bblanton/ANN/'
to = "../data"

def maybe_download(filename, expected_bytes, force=False):
    """Download a file if not present, and make sure it's the right size."""
    print(os.path.join(to,filename))
    print(url+filename)
    if force or not os.path.exists(os.path.join(to,filename)):
        filename, _ = urlretrieve(url + filename, os.path.join(to,filename))
    statinfo = os.stat(os.path.join(to,filename))
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        raise Exception(
          'Failed to verify' + filename + '. Can you get to it with a browser?')
    return filename

data_filename = maybe_download('ann_dataset1.tar', 5642240)


../data/ann_dataset1.tar
http://mrtee.europa.renci.org/~bblanton/ANN/ann_dataset1.tar
Found and verified ann_dataset1.tar

In [28]:
# Ten output data set
# Extract files from the archive
def maybe_extract(filename, force=False):
    extract_folder = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
    root = os.path.dirname(filename)
    if os.path.isdir(extract_folder) and not force:
    # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s.' % (root, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        tar = tarfile.open(filename)
        sys.stdout.flush()
        tar.extractall(path = root)
        tar.close()
    data_files = [
        os.path.join(extract_folder, d) for d in sorted(os.listdir(extract_folder))
        if os.path.isdir(extract_folder)]
    return data_files
  
data_filename = "../data/ann_dataset_10points.tar"
data_files = maybe_extract(data_filename)

# Load files and produce dataset
def maybe_load(file_names):
    names = ('index','time', 'long', 'lat', 'param1', 'param2', 'param3', 'param4', 'out1', 'out2',
            'out3', 'out4','out5', 'out6','out7', 'out8','out9', 'out10',)
    datafile_length = 193
    dataset = np.ndarray(shape=(len(file_names), datafile_length, len(names)))
    for i in range(0,len(file_names)):
        a = np.loadtxt(file_names[i])
        a = np.asarray([x for xs in a for x in xs],dtype='d').reshape([datafile_length,len(names)])
        dataset[i,:,:] = a
        if i%100 == 0:
            print("Processed %d/%d \n"%(i,len(file_names)))
    return dataset

dataset = maybe_load(data_files)
print(dataset.shape)


../data already present - Skipping extraction of ../data/ann_dataset_10points.tar.
Processed 0/324 

Processed 100/324 

Processed 200/324 

Processed 300/324 

(324, 193, 18)

In [48]:



[[  1.00000000e+00  -3.00000000e+00  -7.90000000e+01   2.80900000e+01
    9.73400000e+02   2.71500000e+01   1.10000000e+00   1.01300000e+03
    5.40000000e-03   5.10000000e-03   5.60000000e-03   5.90000000e-03
    6.80000000e-03   7.10000000e-03   1.14000000e-02   1.22000000e-02
    1.34000000e-02   1.67000000e-02]
 [  2.00000000e+00  -2.97917000e+00  -7.90000000e+01   2.81300000e+01
    9.73400000e+02   2.71900000e+01   1.10000000e+00   1.01300000e+03
    4.10000000e-03   4.80000000e-03   5.60000000e-03   6.10000000e-03
    5.80000000e-03   7.10000000e-03   1.00000000e-02   1.08000000e-02
    1.38000000e-02   1.59000000e-02]]

In [27]:
# train, validation, and test dataset percentages
train_percent = 70
valid_percent = 15
test_percent = 15

# train, validation, and test dataset indices
# test: test_start : valid_start-1
# validation: valid_start : train_start-1
# training: train_start : dataset.shape[0]
test_start = 0 
valid_start = 48 #int(test_percent/100.0*dataset.shape[0])
train_start = 48 + 48 #int((test_percent+valid_percent)/100.0*dataset.shape[0])

# Shuffle file indices
file_indices = range(dataset.shape[0])
np.random.shuffle(file_indices)

# Assign datasets
test_dataset = np.array([dataset[j,:,:] for j in [file_indices[i] for i in range(test_start, valid_start)]])
valid_dataset = np.array([dataset[j,:,:] for j in [file_indices[i] for i in range(valid_start, train_start)]])
train_dataset = np.array([dataset[j,:,:] for j in [file_indices[i] for i in range(train_start, dataset.shape[0])]])

# Save memory
#del(dataset)
print("Test dataset: "+str(test_dataset.shape))
print("Validation dataset: "+str(valid_dataset.shape))
print("Training dataset: "+str(train_dataset.shape))


Test dataset: (48, 193, 18)
Validation dataset: (48, 193, 18)
Training dataset: (228, 193, 18)

In [19]:
def accuracy_mse(predictions, outputs):
    err = predictions-outputs
    return np.mean(err*err)

def Normalize(x, means, stds):
    return (x-means)/stds

# Reshape the data and normalize

train_dataset2 = train_dataset[:,:,1:7].reshape((-1, 6)).astype(np.float32)
train_output = train_dataset[:,:,8:18].reshape((-1, 10)).astype(np.float32)
print("train outputs: ",train_dataset.shape)

# calculate means and stds for training dataset
input_means = [np.mean(train_dataset2[:,i]) for i in range(train_dataset2.shape[1])]
input_stds = [np.std(train_dataset2[:,i]) for i in range(train_dataset2.shape[1])]
output_means = [np.mean(train_output[:,i]) for i in range(train_output.shape[1])]
output_stds = [np.std(train_output[:,i]) for i in range(train_output.shape[1])]

train_dataset2 = Normalize(train_dataset2, input_means, input_stds)
#train_output = Normalize(train_output, output_means, output_stds)

print(train_dataset2.shape)
print(train_output.shape)

#plt.plot(train_dataset2[:193,:],label="input")
#plt.plot(train_output[:193,:],label="output")
#plt.ylabel("training data")
#plt.legend(loc='upper right', shadow=True)
#plt.show()

valid_dataset2 = Normalize(valid_dataset[:,:,1:7].reshape((-1, 6)).astype(np.float32), input_means, input_stds)
valid_output = valid_dataset[:,:,8:18].reshape((-1, 10)).astype(np.float32)
#valid_output = Normalize(valid_dataset[:,:,8:18].reshape((-1, 2)).astype(np.float32), output_means, output_stds)

test_dataset2 = Normalize(test_dataset[:,:,1:7].reshape((-1, 6)).astype(np.float32),input_means, input_stds)
test_output = test_dataset[:,:,8:18].reshape((-1, 10)).astype(np.float32)
#test_output = Normalize(test_dataset[:,:,8:18].reshape((-1, 2)).astype(np.float32), output_means, output_stds)


train outputs:  (228, 193, 18)
(44004, 6)
(44004, 10)

In [21]:
num_embeds = 2 # number of embeddings, i.e. 0 -- the original array
#print(train_dataset2[:-1,:])
#print(train_dataset2[1:,:])

def get_embeddings(dataset, num_embeds = 1):
    dataset_list = []
    if num_embeds == 0:
        return dataset
    for i in range(num_embeds):
        dataset_list.append(dataset[i:(-num_embeds+i),:])
        #dataset_list.append(dataset[num_embeds:,:])
    return np.hstack(dataset_list)

train_dataset2 = get_embeddings(train_dataset2, num_embeds)
valid_dataset2 = get_embeddings(valid_dataset2, num_embeds)
test_dataset2 = get_embeddings(test_dataset2, num_embeds)
train_output = train_output[num_embeds:,:]
valid_output = valid_output[num_embeds:,:]
test_output = test_output[num_embeds:,:]

In [20]:
def variance(x):
    return tf.reduce_mean(tf.square(x-tf.reduce_mean(x)))

def explained_var(y_true, y_predicted):
    return 1 - tf.div(variance(tf.sub(y_true,y_predicted)),variance(y_true))

input_size = train_dataset2.shape[1]
output_size = 10

# Deep ANN
batch_size = 57*193
hidden_nodes = 32 #64

num_steps = 20001
starter_learning_rate = 0.005

graph = tf.Graph()
with graph.as_default():

    x = tf.placeholder(tf.float32, shape=(None, input_size)) #train_dataset2.shape(2)
    y = tf.placeholder(tf.float32, shape=(None, output_size))
    
    y_list = tf.split(1, 10, y, name='split')
      
    yl_ = []
    # Building graph
    for o in range(10):
        #o = 5
        weights_0 = tf.Variable(tf.truncated_normal([input_size,hidden_nodes], stddev = 0.01, dtype = tf.float32))
        biases_0 = tf.Variable(tf.zeros([hidden_nodes], dtype = tf.float32))
        input_layer = tf.tanh(tf.matmul(x, weights_0) + biases_0)
        weights_1 = tf.Variable(tf.truncated_normal([hidden_nodes, 1], stddev = 0.01, dtype = tf.float32))
        biases_1 = tf.Variable(tf.zeros([1], dtype = tf.float32))
        output_ = tf.matmul(input_layer, weights_1) + biases_1
        yl_.append(output_)
        tf.add_to_collection('losses',tf.reduce_mean(tf.square(output_-y_list[o])))
    y_ = tf.pack(yl_)
    #print(y_)
    y_ = tf.transpose(tf.reshape(y_, shape=(10, -1)))
    #print(y_)
    
    losses = tf.get_collection('losses')
    loss = tf.add_n(losses, name='total_loss')/output_size
    
    global_step = tf.Variable(0.0, trainable=False)
    learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, num_steps, 0.5, staircase=False)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step=global_step)    

    """
    train_prediction = loss
    valid_prediction = tf.tanh(tf.matmul(tf_valid_dataset, weights_0) + biases_0)
    valid_prediction = tf.tanh(tf.matmul(valid_prediction, weights_1) + biases_1)
    valid_prediction = tf.matmul(valid_prediction, weights_2) + biases_2
    
    test_prediction = tf.tanh(tf.matmul(tf_test_dataset, weights_0) + biases_0)
    test_prediction = tf.tanh(tf.matmul(test_prediction, weights_1) + biases_1)
    test_prediction = tf.matmul(test_prediction, weights_2) + biases_2
    """

In [21]:
def stop(data, n = 3):
    assert len(data) > n*2
    avg = sum(data)/len(data)
    for x in (data-avg)[-n:]:
        if x <= 0:
            return False
    return True

from sklearn.metrics import explained_variance_score

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    sum_l = 0
    num_l = 0
    #ev_l = [0,0,0,0,0,0,0]
    print('Initialized')
    for step in range(num_steps):
        offset = (step * batch_size) % (train_output.shape[0] - batch_size)
        batch_data = train_dataset2[offset:(offset + batch_size), :]
        batch_output = train_output[offset:(offset + batch_size), :]
        feed_dict = {x : batch_data, y : batch_output}
        start_time = time.time()
        _, l = session.run([optimizer, loss],feed_dict=feed_dict)
        duration = time.time()-start_time
        
        sum_l += l
        num_l += 1

        if (step % 500 == 0):
            valid_loss = loss.eval(feed_dict = {x: valid_dataset2, y: valid_output})
            #print(predictions)
            #ev = explained_variance_score(y_.eval(feed_dict = {x: valid_dataset2, y: valid_output}), valid_output)
            #ev_l.append(valid_loss)
            #ev_l = ev_l[1:]
            print('Loss at step %d (%.2f op/sec): %f; validation loss: %.6f' % (
                    step, 1.0/duration, sum_l/num_l, 
                    #accuracy_mse(valid_prediction.eval(), valid_output)))
                    valid_loss))
            sum_l = 0
            num_l = 0
            #if stop(ev_l):
            #    print("Non decreasing scores, so stopping early")
            #    break
    
    feed_dict = {x: test_dataset2, y: test_output}
    predictions, test_loss = session.run([y_, loss],feed_dict=feed_dict)
    #test_loss = loss.eval(feed_dict = {x: test_dataset2, y: test_output})
    print('Test MSE: %.4f' % test_loss)
    #print('Test losses:', test_losses)
    predicted_vs_actual = np.hstack((predictions, test_output))


Initialized
Loss at step 0 (19.20 op/sec): 0.081874; validation loss: 0.082135
Loss at step 500 (34.80 op/sec): 0.044962; validation loss: 0.033111
Loss at step 1000 (31.39 op/sec): 0.024669; validation loss: 0.022864
Loss at step 1500 (33.67 op/sec): 0.019150; validation loss: 0.019426
Loss at step 2000 (33.14 op/sec): 0.016856; validation loss: 0.017833
Loss at step 2500 (33.53 op/sec): 0.015503; validation loss: 0.016565
Loss at step 3000 (30.88 op/sec): 0.014549; validation loss: 0.015684
Loss at step 3500 (31.78 op/sec): 0.013825; validation loss: 0.014854
Loss at step 4000 (32.77 op/sec): 0.013179; validation loss: 0.014086
Loss at step 4500 (29.92 op/sec): 0.012590; validation loss: 0.013451
Loss at step 5000 (33.65 op/sec): 0.012078; validation loss: 0.012920
Loss at step 5500 (32.90 op/sec): 0.011700; validation loss: 0.012581
Loss at step 6000 (33.14 op/sec): 0.011395; validation loss: 0.012253
Loss at step 6500 (32.18 op/sec): 0.011128; validation loss: 0.011977
Loss at step 7000 (33.58 op/sec): 0.010894; validation loss: 0.011771
Loss at step 7500 (34.06 op/sec): 0.010679; validation loss: 0.011523
Loss at step 8000 (33.71 op/sec): 0.010498; validation loss: 0.011326
Loss at step 8500 (32.71 op/sec): 0.010332; validation loss: 0.011180
Loss at step 9000 (33.48 op/sec): 0.010162; validation loss: 0.010939
Loss at step 9500 (33.72 op/sec): 0.010009; validation loss: 0.010760
Loss at step 10000 (33.41 op/sec): 0.009873; validation loss: 0.010639
Loss at step 10500 (32.64 op/sec): 0.009745; validation loss: 0.010450
Loss at step 11000 (33.36 op/sec): 0.009637; validation loss: 0.010378
Loss at step 11500 (32.81 op/sec): 0.009547; validation loss: 0.010307
Loss at step 12000 (32.35 op/sec): 0.009461; validation loss: 0.010210
Loss at step 12500 (32.29 op/sec): 0.009386; validation loss: 0.010144
Loss at step 13000 (31.96 op/sec): 0.009321; validation loss: 0.010114
Loss at step 13500 (30.84 op/sec): 0.009252; validation loss: 0.010005
Loss at step 14000 (33.77 op/sec): 0.009190; validation loss: 0.009988
Loss at step 14500 (32.85 op/sec): 0.009138; validation loss: 0.009969
Loss at step 15000 (30.40 op/sec): 0.009084; validation loss: 0.009881
Loss at step 15500 (32.06 op/sec): 0.009034; validation loss: 0.009870
Loss at step 16000 (34.09 op/sec): 0.008996; validation loss: 0.009873
Loss at step 16500 (32.23 op/sec): 0.008952; validation loss: 0.009781
Loss at step 17000 (32.75 op/sec): 0.008912; validation loss: 0.009780
Loss at step 17500 (33.50 op/sec): 0.008881; validation loss: 0.009776
Loss at step 18000 (33.39 op/sec): 0.008843; validation loss: 0.009698
Loss at step 18500 (32.30 op/sec): 0.008810; validation loss: 0.009708
Loss at step 19000 (30.25 op/sec): 0.008785; validation loss: 0.009710
Loss at step 19500 (32.47 op/sec): 0.008751; validation loss: 0.009638
Loss at step 20000 (32.95 op/sec): 0.008723; validation loss: 0.009649
Test MSE: 0.0102

In [23]:
over_95 = 0
for i in range(10):
    cc = np.corrcoef(predicted_vs_actual[:,i],predicted_vs_actual[:,i+10])[0,1]
    m = np.max(np.abs(predicted_vs_actual[:,i]-predicted_vs_actual[:,i+10]))
    mse = np.mean(np.sqrt(np.square(predicted_vs_actual[:,i]-predicted_vs_actual[:,i+10])))
    
    if cc >= 0.95:
        over_95+=1
    print('Point %d: Max loss: %.6f, MSE: %.6f CC: %.6f %c' % (i,m, mse, cc, 'v' if cc >= 0.95 else ' '))

i = 5
k = np.argmax(np.abs(predicted_vs_actual[:,i]-predicted_vs_actual[:,i+10]))
print(k)
max_error_case = [np.abs(predicted_vs_actual[k,i]-predicted_vs_actual[k,i+10]) for i in range(10)]
print(max_error_case)
start = (k // 193)*193
stop = start + 193
#start = 0
#stop = 20*193

print(predicted_vs_actual.shape)
fig = plt.figure(figsize=(10, 6), dpi=80)

for i in range(10):
    sp = fig.add_subplot(10,1,i+1)
    sp.plot(predicted_vs_actual[start:stop,i],color="blue", linewidth=1.0, linestyle="-", label="ANN")
    sp.plot(predicted_vs_actual[start:stop,i+10],color="red", linewidth=1.0, linestyle="-", label="actual")
plt.show()


Point 0: Max loss: 1.240468, MSE: 0.028800 CC: 0.895165  
Point 1: Max loss: 1.165016, MSE: 0.025457 CC: 0.921425  
Point 2: Max loss: 0.863533, MSE: 0.019804 CC: 0.931636  
Point 3: Max loss: 0.732300, MSE: 0.014674 CC: 0.949629  
Point 4: Max loss: 0.956068, MSE: 0.023755 CC: 0.947756  
Point 5: Max loss: 1.324671, MSE: 0.041982 CC: 0.907157  
Point 6: Max loss: 2.073830, MSE: 0.076835 CC: 0.930429  
Point 7: Max loss: 1.744426, MSE: 0.086797 CC: 0.922877  
Point 8: Max loss: 1.519407, MSE: 0.066029 CC: 0.903478  
Point 9: Max loss: 1.887906, MSE: 0.094386 CC: 0.865807  
2275
[0.084737197, 0.080382437, 0.034475878, 0.041143581, 0.13712037, 1.3246714, 0.20506975, 0.15680939, 0.077375188, 0.27674031]
(9264, 20)

In [231]:
d = np.asarray([])

d = np.append(d,3)
d = np.append(d,2)
d = np.append(d,1)[1:]
print(d)

def stop(data, n = 2):
    assert len(data) > n*2
    avg = sum(data)/len(data)
    for x in (data-avg)[-n:]:
        if x <= 0:
            return False
    return True

print(stop(d))


[ 2.  1.]
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-231-59addae8a709> in <module>()
     14     return True
     15 
---> 16 print(stop(d))

<ipython-input-231-59addae8a709> in stop(data, n)
      7 
      8 def stop(data, n = 2):
----> 9     assert len(data) > n*2
     10     avg = sum(data)/len(data)
     11     for x in (data-avg)[-n:]:

AssertionError: 

In [134]:
from sklearn.metrics import explained_variance_score
import scipy.stats as stats
import pylab

ev = []
for i in range(10):
    y_true = predicted_vs_actual[:,i]
    y_pred = predicted_vs_actual[:,i+10]
    ev.append(explained_variance_score(y_true, y_pred))
    
print("Explained variance: ",ev)
diff = y_true-y_pred

#stats.probplot(diff, dist="norm", plot=pylab)
stats.probplot(diff, dist="t", sparams=(2), plot=pylab)
pylab.show()

num_bins = 100
# the histogram of the errors
n, bins, patches = plt.hist(diff, num_bins, normed=1, facecolor='green', alpha=0.5)

params = stats.t.fit(diff)
dist = stats.t(params[0], params[1], params[2])
x = np.linspace(-2, 2, num_bins)
plt.plot(x, dist.pdf(x), 'b-', lw = 3, alpha=0.5, label='t pdf')
plt.show()
print(params)

"""
num_bins = 100
# the histogram of the errors
n, bins, patches = plt.hist(diff, num_bins, normed=1, facecolor='green', alpha=0.5)

# add a normal PDF
mu = 0
sigma = .05
y = mlab.normpdf(bins, mu, sigma)
plt.plot(bins, y, 'r-')
plt.xlabel('Smarts')
plt.ylabel('Probability')

# add Cauchy PDF
params = cauchy.fit(diff)
print(params)
dist = cauchy(params[0], params[1])
x = np.linspace(-2, 2, num_bins)
plt.plot(x, dist.pdf(x), 'b-', alpha=0.5, label='cauchy pdf')


# Tweak spacing to prevent clipping of ylabel
#plt.subplots_adjust(left=0.15)
#plt.show()

fig = plt.figure(figsize=(10,6),dpi=80)
plt.hist(diff, bins = 100, alpha=0.5)
plt.show()
"""


Explained variance:  [0.81146699190139771, 0.85759055614471436, 0.86870527267456055, 0.89061909914016724, 0.88695210218429565, 0.74401175975799561, 0.8059995174407959, 0.80061358213424683, 0.45739859342575073, 0.35861217975616455]
(1.4245720507513746, -0.030474568797174348, 0.062562678013771522)
Out[134]:
"\nnum_bins = 100\n# the histogram of the errors\nn, bins, patches = plt.hist(diff, num_bins, normed=1, facecolor='green', alpha=0.5)\n\n# add a normal PDF\nmu = 0\nsigma = .05\ny = mlab.normpdf(bins, mu, sigma)\nplt.plot(bins, y, 'r-')\nplt.xlabel('Smarts')\nplt.ylabel('Probability')\n\n# add Cauchy PDF\nparams = cauchy.fit(diff)\nprint(params)\ndist = cauchy(params[0], params[1])\nx = np.linspace(-2, 2, num_bins)\nplt.plot(x, dist.pdf(x), 'b-', alpha=0.5, label='cauchy pdf')\n\n\n# Tweak spacing to prevent clipping of ylabel\n#plt.subplots_adjust(left=0.15)\n#plt.show()\n\nfig = plt.figure(figsize=(10,6),dpi=80)\nplt.hist(diff, bins = 100, alpha=0.5)\nplt.show()\n"