In [1]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import time
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
import tensorflow as tf
import scipy

In [2]:
# Download and save the archived data

url = 'http://mrtee.europa.renci.org/~bblanton/ANN/'
to = "../data"

def maybe_download(filename, expected_bytes, force=False):
    """Download a file if not present, and make sure it's the right size."""
    print(os.path.join(to,filename))
    print(url+filename)
    if force or not os.path.exists(os.path.join(to,filename)):
        filename, _ = urlretrieve(url + filename, os.path.join(to,filename))
    statinfo = os.stat(os.path.join(to,filename))
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        raise Exception(
          'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

data_filename = maybe_download('ann_dataset1.tar', 5642240)


../data/ann_dataset1.tar
http://mrtee.europa.renci.org/~bblanton/ANN/ann_dataset1.tar
Found and verified ann_dataset1.tar

In [7]:
# Ten output data set
# Extract files from the archive
def maybe_extract(filename, force=False):
    extract_folder = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
    root = os.path.dirname(filename)
    if os.path.isdir(extract_folder) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s.' % (root, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        tar = tarfile.open(filename)
        sys.stdout.flush()
        tar.extractall(path = root)
        tar.close()
    data_files = [
        os.path.join(extract_folder, d) for d in sorted(os.listdir(extract_folder))
        if os.path.isfile(os.path.join(extract_folder, d))]
    return data_files
  
data_filename = "../data/ann_dataset_10points.tar"
data_files = maybe_extract(data_filename)

# Load files and produce dataset
def maybe_load(file_names):
    names = ('index','time', 'long', 'lat', 'param1', 'param2', 'param3', 'param4', 'out1', 'out2',
            'out3', 'out4','out5', 'out6','out7', 'out8','out9', 'out10',)
    datafile_length = 193
    dataset = np.ndarray(shape=(len(file_names), datafile_length, len(names)))
    for i in range(len(file_names)):
        # np.loadtxt already returns a (datafile_length, len(names)) float array
        dataset[i,:,:] = np.loadtxt(file_names[i]).reshape(datafile_length, len(names))
        if i%100 == 0:
            print("Processed %d/%d \n"%(i,len(file_names)))
    return dataset

dataset = maybe_load(data_files)
print(dataset.shape)


../data already present - Skipping extraction of ../data/ann_dataset_10points.tar.
Processed 0/324 

Processed 100/324 

Processed 200/324 

Processed 300/324 

(324, 193, 18)
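For the column slicing in the next cells it helps to have the layout of the 18 columns at hand. A quick lookup built from the names tuple in maybe_load (illustrative only):

names = ('index', 'time', 'long', 'lat', 'param1', 'param2', 'param3', 'param4',
         'out1', 'out2', 'out3', 'out4', 'out5', 'out6', 'out7', 'out8', 'out9', 'out10')
col = {name: i for i, name in enumerate(names)}
print(col['time'], col['param3'], col['out10'])  # 1 6 17 -- the columns sliced below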

In [157]:
"""
We need to convert the dataset into b(x) = [1 x1 x2 ... x6 x1^2 x1x2 x1x3 ... x6^2]
"""
def Normalize(x, means, stds):
    return (x-means)/stds

flat_dataset = np.reshape(dataset,(-1,18))
inputs_only = flat_dataset[:,1:7]    # columns 1-6: time, long, lat, param1..param3
outputs_only = flat_dataset[:,17:18] # column 17: out10 only

input_means = [np.mean(inputs_only[:,i]) for i in range(inputs_only.shape[1])]
input_stds = [np.std(inputs_only[:,i]) for i in range(inputs_only.shape[1])]
inputs_only = (inputs_only-input_means)/input_stds

inputs_quadratic = np.reshape(np.ones(inputs_only.shape[0]),(-1,1)) # bias column of ones
inputs_quadratic = np.hstack((inputs_quadratic, inputs_only))       # append the linear terms

for j in range(6):
    for k in range(j,6):
        ds = np.reshape(np.multiply(inputs_only[:,j],inputs_only[:,k]),(-1,1))
        inputs_quadratic = np.hstack((inputs_quadratic, ds))

test_start = 0
valid_start = 48*193       # first 48 cases (193 time points each) for testing
train_start = (48+48)*193  # next 48 cases for validation; the rest for training

test_dataset = np.hstack((inputs_quadratic[test_start:valid_start,:],outputs_only[test_start:valid_start,:]))
valid_dataset = np.hstack((inputs_quadratic[valid_start:train_start,:],outputs_only[valid_start:train_start,:]))
train_dataset = np.hstack((inputs_quadratic[train_start:,:],outputs_only[train_start:,:]))
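As a sanity check, the expanded basis should have 1 + 6 + 21 = 28 columns: the bias, the 6 linear terms, and the 21 products x_j*x_k with j <= k. A self-contained sketch of the same expansion via itertools, on made-up data:

import numpy as np
from itertools import combinations_with_replacement

def quadratic_basis(X):
    # [1, x1..xn, x_j*x_k for all j <= k], per row
    cols = [np.ones((X.shape[0], 1)), X]
    for j, k in combinations_with_replacement(range(X.shape[1]), 2):
        cols.append((X[:, j] * X[:, k]).reshape(-1, 1))
    return np.hstack(cols)

print(quadratic_basis(np.random.randn(5, 6)).shape)  # (5, 28)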

In [217]:
print(train_dataset.shape)
B = train_dataset
x = test_dataset[234,1:7] # one point's linear features from the test dataset
d = train_dataset[:,1:7]-x # differences from every training point
print(d.shape)


(44004, 29)
(44004, 6)
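The distance computation above looks like groundwork for a kernel-weighted (support-vector style) predictor over the training set. A minimal NumPy sketch of that idea; the Gaussian kernel and the bandwidth sigma are assumptions for illustration, not the notebook's final method:

import numpy as np

def rbf_predict(train_x, train_y, query_x, sigma=1.0):
    # Nadaraya-Watson estimate: average train_y, weighted by a Gaussian
    # kernel on the distance from query_x to each training input.
    d2 = np.sum(np.square(train_x - query_x), axis=1)
    w = np.exp(-d2 / (2.0 * sigma**2))
    return np.dot(w, train_y) / np.sum(w)

# e.g. rbf_predict(train_dataset[:,1:7], train_dataset[:,28], test_dataset[234,1:7])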

In [207]:
num_steps = 1001
starter_learning_rate = 0.001

graph = tf.Graph()
with graph.as_default(): 

    support_x = tf.placeholder(tf.float32, shape=(None, 28)) # support vectors, initially the whole training set
    support_y_ = tf.placeholder(tf.float32, shape=(None, 1)) # support vector outputs
    
    x = tf.placeholder(tf.float32, shape=(None, 28)) # true vectors, initially the whole training set
    y_ = tf.placeholder(tf.float32, shape=(None, 1)) # true vector outputs
    
    """
    # coefficients a
    a_matrix = tf.Variable(tf.truncated_normal([28,1], stddev = 0.1, dtype = tf.float32))
        
    # calculate outputs at support vectors
    support_y = tf.matmul(support_x,a_matrix)
    
    # support vectors weights
    w_matrix = tf.Variable(tf.truncated_normal([batch_size,1], stddev = 0.1, dtype = tf.float32))
    
    # calculate losses at support vectors to optimize a_matrix and w_matrix
    weighted_diff = tf.matmul(tf.transpose(tf.square(support_y-support_y_)),tf.abs(w_matrix))
    loss = tf.reduce_mean(weighted_diff)
    
    global_step = tf.Variable(0.0, trainable=False)
    learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, num_steps, 1.0, staircase=False)
    
     # Create ADAM optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate)

    # Calculate gradients and apply them
    grads, v = zip(*optimizer.compute_gradients(loss))
    grads, _ = tf.clip_by_global_norm(grads, 1.25)
    apply_gradient_op = optimizer.apply_gradients(zip(grads,v), global_step = global_step)
    """

In [208]:
from sklearn.metrics import explained_variance_score

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()

    print('Initialized')
    for step in range(num_steps):
        #offset = (step * batch_size) % (train_output.shape[0] - batch_size)
        #batch_data = train_dataset[offset:(offset + batch_size),:28].astype(np.float32)
        #batch_output = train_dataset[offset:(offset + batch_size), 28:].astype(np.float32)
        batch_data = train_dataset[:,:28].astype(np.float32)
        batch_output = train_dataset[:, 28:].astype(np.float32)

        feed_dict = {x : batch_data, y_ : batch_output}
        start_time = time.time()
        _, l = session.run([apply_gradient_op, loss],feed_dict=feed_dict)

        duration = time.time()-start_time
        
        if (step % 50 == 0):
            #valid_loss = loss.eval(feed_dict = {x: valid_dataset[:,:28], y_: valid_dataset[:,28:]})
            valid_loss = 0
            #print(predictions)
            #ev = explained_variance_score(y_.eval(feed_dict = {x: valid_dataset2, y: valid_output}), valid_output)
            #ev_l.append(valid_loss)
            #ev_l = ev_l[1:]
            print('Loss at step %d (%.2f op/sec): %f; validation loss: %.6f' % (
                    step, 1.0/duration, l, valid_loss))
            #if stop(ev_l):
            #    print("Non decreasing scores, so stopping early")
            #    break
    
    #feed_dict = {x: test_dataset[:,:28],y_: test_dataset[:,28:]}
    #predictions, test_loss = session.run([y, loss],feed_dict=feed_dict)
    #test_loss = loss.eval(feed_dict = {x: test_dataset2, y: test_output})
    #print('Test MSE: %.4f' % test_loss)
    #print('Test losses:', test_losses)
    #predicted_vs_actual = np.hstack((predictions, test_output))


Initialized
Loss at step 0 (85.79 op/sec): 536.820801; validation loss: 0.000000
Loss at step 50 (130.91 op/sec): 165.063232; validation loss: 0.000000
Loss at step 100 (118.88 op/sec): 34.321072; validation loss: 0.000000
Loss at step 150 (122.64 op/sec): 3.051094; validation loss: 0.000000
Loss at step 200 (120.99 op/sec): 0.906050; validation loss: 0.000000
Loss at step 250 (121.66 op/sec): 0.809061; validation loss: 0.000000
Loss at step 300 (125.80 op/sec): 0.760943; validation loss: 0.000000
Loss at step 350 (118.82 op/sec): 0.772881; validation loss: 0.000000
Loss at step 400 (124.52 op/sec): 0.763551; validation loss: 0.000000
Loss at step 450 (133.23 op/sec): 0.739514; validation loss: 0.000000
Loss at step 500 (119.67 op/sec): 0.741241; validation loss: 0.000000
Loss at step 550 (109.96 op/sec): 0.751168; validation loss: 0.000000
Loss at step 600 (124.25 op/sec): 0.736728; validation loss: 0.000000
Loss at step 650 (124.92 op/sec): 0.736312; validation loss: 0.000000
Loss at step 700 (116.73 op/sec): 0.712569; validation loss: 0.000000
Loss at step 750 (131.88 op/sec): 0.709970; validation loss: 0.000000
Loss at step 800 (123.09 op/sec): 0.739903; validation loss: 0.000000
Loss at step 850 (119.97 op/sec): 0.719569; validation loss: 0.000000
Loss at step 900 (121.46 op/sec): 0.718824; validation loss: 0.000000
Loss at step 950 (123.51 op/sec): 0.713745; validation loss: 0.000000
Loss at step 1000 (127.16 op/sec): 0.709675; validation loss: 0.000000

In [156]:
over_95 = 0
for i in range(10):
    cc = np.corrcoef(predicted_vs_actual[:,i],predicted_vs_actual[:,i+10])[0,1]
    m = np.max(np.abs(predicted_vs_actual[:,i]-predicted_vs_actual[:,i+10]))
    mae = np.mean(np.abs(predicted_vs_actual[:,i]-predicted_vs_actual[:,i+10])) # mean absolute error (sqrt of a square is abs, so this is MAE, not MSE)
    
    if cc >= 0.95:
        over_95+=1
    print('Point %d: Max loss: %.6f, MAE: %.6f CC: %.6f %c' % (i,m, mae, cc, 'v' if cc >= 0.95 else ' '))

i = 5
k = np.argmax(np.abs(predicted_vs_actual[:,i]-predicted_vs_actual[:,i+10]))
print(k)
max_error_case = [np.abs(predicted_vs_actual[k,i]-predicted_vs_actual[k,i+10]) for i in range(10)]
print(max_error_case)
start = (k // 193)*193 # first index of the case containing the worst error
stop = start + 193
start = 0              # override: plot the first 20 cases instead
stop = 20*193

print(predicted_vs_actual.shape)
fig = plt.figure(figsize=(10, 6), dpi=80)

for i in range(10):
    sp = fig.add_subplot(10,1,i+1)
    sp.plot(predicted_vs_actual[start:stop,i],color="blue", linewidth=1.0, linestyle="-", label="ANN")
    sp.plot(predicted_vs_actual[start:stop,i+10],color="red", linewidth=1.0, linestyle="-", label="actual")
plt.show()


Point 0: Max loss: 5.530039, MAE: 0.206465 CC: 0.208662  
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-156-5ac3638afac9> in <module>()
      1 over_95 = 0
      2 for i in range(10):
----> 3     cc = np.corrcoef(predicted_vs_actual[:,i],predicted_vs_actual[:,i+10])[0,1]
      4     m = np.max(np.abs(predicted_vs_actual[:,i]-predicted_vs_actual[:,i+10]))
      5     mae = np.mean(np.abs(predicted_vs_actual[:,i]-predicted_vs_actual[:,i+10]))

IndexError: index 11 is out of bounds for axis 1 with size 11

In [231]:
d = np.asarray([])

d = np.append(d,3)
d = np.append(d,2)
d = np.append(d,1)[1:]
print(d)

def stop(data, n = 2): # early stopping: True when the last n scores all sit above the mean
    assert len(data) > n*2 # need more than n*2 points of history
    avg = sum(data)/len(data)
    for x in (data-avg)[-n:]:
        if x <= 0:
            return False
    return True

print(stop(d))


[ 2.  1.]
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-231-59addae8a709> in <module>()
     14     return True
     15 
---> 16 print(stop(d))

<ipython-input-231-59addae8a709> in stop(data, n)
      7 
      8 def stop(data, n = 2):
----> 9     assert len(data) > n*2
     10     avg = sum(data)/len(data)
     11     for x in (data-avg)[-n:]:

AssertionError: 
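The assertion fires because stop() needs more than n*2 points of history. With longer (made-up) loss histories, reusing stop() from the cell above, the check behaves as intended:

improving = np.asarray([5.0, 4.0, 3.0, 2.0, 1.0])
plateaued = np.asarray([1.0, 1.0, 1.0, 2.0, 3.0])
print(stop(improving))  # False: the last 2 values sit below the running average
print(stop(plateaued))  # True: the last 2 values sit above it, so stop early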

In [134]:
from sklearn.metrics import explained_variance_score
import scipy.stats as stats
import pylab

ev = []
for i in range(10):
    y_true = predicted_vs_actual[:,i]
    y_pred = predicted_vs_actual[:,i+10]
    ev.append(explained_variance_score(y_true, y_pred))
    
print("Explained variance: ",ev)
diff = y_true-y_pred # residuals for the last point (out10)

#stats.probplot(diff, dist="norm", plot=pylab)
stats.probplot(diff, dist="t", sparams=(2), plot=pylab)
pylab.show()

num_bins = 100
# the histogram of the errors
n, bins, patches = plt.hist(diff, num_bins, normed=1, facecolor='green', alpha=0.5)

params = stats.t.fit(diff)
dist = stats.t(params[0], params[1], params[2])
x = np.linspace(-2, 2, num_bins)
plt.plot(x, dist.pdf(x), 'b-', lw = 3, alpha=0.5, label='t pdf')
plt.show()
print(params)

"""
num_bins = 100
# the histogram of the errors
n, bins, patches = plt.hist(diff, num_bins, normed=1, facecolor='green', alpha=0.5)

# add a normal PDF
mu = 0
sigma = .05
y = mlab.normpdf(bins, mu, sigma)
plt.plot(bins, y, 'r-')
plt.xlabel('Smarts')
plt.ylabel('Probability')

# add Cauchy PDF
params = cauchy.fit(diff)
print(params)
dist = cauchy(params[0], params[1])
x = np.linspace(-2, 2, num_bins)
plt.plot(x, dist.pdf(x), 'b-', alpha=0.5, label='cauchy pdf')


# Tweak spacing to prevent clipping of ylabel
#plt.subplots_adjust(left=0.15)
#plt.show()

fig = plt.figure(figsize=(10,6),dpi=80)
plt.hist(diff, bins = 100, alpha=0.5)
plt.show()
"""


Explained variance:  [0.81146699190139771, 0.85759055614471436, 0.86870527267456055, 0.89061909914016724, 0.88695210218429565, 0.74401175975799561, 0.8059995174407959, 0.80061358213424683, 0.45739859342575073, 0.35861217975616455]
(1.4245720507513746, -0.030474568797174348, 0.062562678013771522)
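For reference, stats.t.fit returns (df, loc, scale), so the fitted df of about 1.42 says the residuals are very heavy-tailed (a t distribution has infinite variance for df <= 2). A quick illustration using the params from the cell above; the 0.5 threshold is arbitrary:

df, loc, scale = params  # stats.t.fit returns (shape df, loc, scale)
tail = stats.t.sf(0.5, df, loc, scale) + stats.t.cdf(-0.5, df, loc, scale)
print('df=%.2f (heavy-tailed), P(|err| > 0.5)=%.4f' % (df, tail))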