In [2]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import time
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
import tensorflow as tf
import scipy
In [3]:
# Download and save the archived data
url = 'http://mrtee.europa.renci.org/~bblanton/ANN/'
to = "../data"
def maybe_download(filename, expected_bytes, force=False):
    """Download a file if not present, and make sure it's the right size."""
    print(os.path.join(to, filename))
    print(url + filename)
    if force or not os.path.exists(os.path.join(to, filename)):
        filename, _ = urlretrieve(url + filename, os.path.join(to, filename))
    statinfo = os.stat(os.path.join(to, filename))
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename
data_filename = maybe_download('ann_dataset1.tar', 5642240)
In [4]:
# Ten output data set
# Extract files from the archive
def maybe_extract(filename, force=False):
    extract_folder = os.path.splitext(os.path.splitext(filename)[0])[0]  # strip the .tar extension
    root = os.path.dirname(filename)
    if os.path.isdir(extract_folder) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s.' % (root, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        tar = tarfile.open(filename)
        sys.stdout.flush()
        tar.extractall(path=root)
        tar.close()
    data_files = [
        os.path.join(extract_folder, d) for d in sorted(os.listdir(extract_folder))
        if os.path.isfile(os.path.join(extract_folder, d))]
    return data_files
data_filename = "../data/ann_dataset_10points.tar"
data_files = maybe_extract(data_filename)
# Load files and produce dataset
def maybe_load(file_names):
    names = ('index', 'time', 'long', 'lat', 'param1', 'param2', 'param3', 'param4', 'out1', 'out2',
             'out3', 'out4', 'out5', 'out6', 'out7', 'out8', 'out9', 'out10',)
    datafile_length = 193
    dataset = np.ndarray(shape=(len(file_names), datafile_length, len(names)))
    for i in range(0, len(file_names)):
        a = np.loadtxt(file_names[i])
        # Flatten and reshape so each file yields a (datafile_length, len(names)) block,
        # even if a record spans several lines in the text file.
        a = np.asarray([x for xs in a for x in xs], dtype='d').reshape([datafile_length, len(names)])
        dataset[i, :, :] = a
        if i % 100 == 0:
            print("Processed %d/%d \n" % (i, len(file_names)))
    return dataset
dataset = maybe_load(data_files)
print(dataset.shape)
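As a quick check of the layout produced by maybe_load() (an illustrative addition, not part of the original notebook), each file should contribute a 193 x 18 block matching the names tuple:

# Columns: index, time, long, lat, param1..param4, out1..out10.
assert dataset.shape[1:] == (193, 18)
print(dataset[0, 0, :])  # first time step of the first file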
In [7]:
# train, validation, and test dataset percentages
train_percent = 70
valid_percent = 15
test_percent = 15
# train, validation, and test dataset indices
# test: test_start : valid_start-1
# validation: valid_start : train_start-1
# training: train_start : dataset.shape[0]
test_start = 0
valid_start = 48 #int(test_percent/100.0*dataset.shape[0])
train_start = 48 + 48 #int((test_percent+valid_percent)/100.0*dataset.shape[0])
# Shuffle file indices
file_indices = list(range(dataset.shape[0]))
np.random.shuffle(file_indices)
# Assign datasets
test_dataset = dataset[file_indices[test_start:valid_start], :, :]
valid_dataset = dataset[file_indices[valid_start:train_start], :, :]
train_dataset = dataset[file_indices[train_start:], :, :]
# Save memory
#del(dataset)
print("Test dataset: "+str(test_dataset.shape))
print("Validation dataset: "+str(valid_dataset.shape))
print("Training dataset: "+str(train_dataset.shape))
In [44]:
location = 3
def accuracy_mse(predictions, outputs):
    err = predictions - outputs
    return np.mean(err * err)

def Normalize(x, means, stds):
    return (x - means) / stds
# Reshape the data and normalize
train_dataset2 = train_dataset[:,:,1:7].reshape((-1, 6)).astype(np.float32)
train_output = train_dataset[:,:,8+location].reshape((-1, 1)).astype(np.float32)
# calculate means and stds for training dataset
input_means = [np.mean(train_dataset2[:,i]) for i in range(train_dataset2.shape[1])]
input_stds = [np.std(train_dataset2[:,i]) for i in range(train_dataset2.shape[1])]
output_means = [np.mean(train_output[:,i]) for i in range(train_output.shape[1])]
output_stds = [np.std(train_output[:,i]) for i in range(train_output.shape[1])]
train_dataset2 = Normalize(train_dataset2, input_means, input_stds)
#train_output = Normalize(train_output, output_means, output_stds)
print(train_dataset2.shape)
print(train_output.shape)
#plt.plot(train_dataset2[:193,:],label="input")
#plt.plot(train_output[:193,:],label="output")
#plt.ylabel("training data")
#plt.legend(loc='upper right', shadow=True)
#plt.show()
valid_dataset2 = Normalize(valid_dataset[:,:,1:7].reshape((-1, 6)).astype(np.float32), input_means, input_stds)
valid_output = valid_dataset[:,:,8+location].reshape((-1, 1)).astype(np.float32)
#valid_output = Normalize(valid_dataset[:,:,8:18].reshape((-1, 2)).astype(np.float32), output_means, output_stds)
test_dataset2 = Normalize(test_dataset[:,:,1:7].reshape((-1, 6)).astype(np.float32),input_means, input_stds)
test_output = test_dataset[:,:,8+location].reshape((-1, 1)).astype(np.float32)
#test_output = Normalize(test_dataset[:,:,8:18].reshape((-1, 2)).astype(np.float32), output_means, output_stds)
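If the commented-out output normalization is ever turned on, predictions come back in standardized units and must be mapped back before comparing against the raw outputs. Below is a minimal inverse of Normalize() with a round-trip check; the Denormalize name is an illustrative addition, not part of the original notebook.

def Denormalize(x, means, stds):
    """Inverse of Normalize(): map standardized values back to original units."""
    return x * stds + means

# Round-trip check on the raw validation inputs.
raw = valid_dataset[:, :, 1:7].reshape((-1, 6)).astype(np.float32)
assert np.allclose(Denormalize(Normalize(raw, input_means, input_stds),
                               input_means, input_stds), raw, atol=1e-3)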
In [21]:
num_embeds = 2 # number of time-lagged copies of the inputs to stack; 0 means use the original array unchanged
#print(train_dataset2[:-1,:])
#print(train_dataset2[1:,:])
def get_embeddings(dataset, num_embeds=1):
    dataset_list = []
    if num_embeds == 0:
        return dataset
    for i in range(num_embeds):
        dataset_list.append(dataset[i:(-num_embeds + i), :])
    #dataset_list.append(dataset[num_embeds:,:])
    return np.hstack(dataset_list)
train_dataset2 = get_embeddings(train_dataset2, num_embeds)
valid_dataset2 = get_embeddings(valid_dataset2, num_embeds)
test_dataset2 = get_embeddings(test_dataset2, num_embeds)
train_output = train_output[num_embeds:,:]
valid_output = valid_output[num_embeds:,:]
test_output = test_output[num_embeds:,:]
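To make the lag structure concrete, here is a toy run of get_embeddings() (an illustrative addition): with num_embeds = 2 each input row stacks the values at lags 2 and 1, and trimming the outputs by num_embeds aligns every target with the two preceding input rows.

# Toy example: 5 time steps, 1 feature, values 0..4.
toy = np.arange(5, dtype=np.float32).reshape(5, 1)
toy_emb = get_embeddings(toy, 2)
print(toy_emb)
# [[0. 1.]
#  [1. 2.]
#  [2. 3.]]
toy_out = toy[2:, :]    # aligned targets: rows 2, 3, 4
print(toy_out.ravel())  # [2. 3. 4.]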
In [45]:
def variance(x):
    return tf.reduce_mean(tf.square(x - tf.reduce_mean(x)))

def explained_var(y_true, y_predicted):
    return 1 - tf.div(variance(tf.sub(y_true, y_predicted)), variance(y_true))
input_size = train_dataset2.shape[1]
output_size = 1
# Deep ANN
batch_size = 57*193
hidden_nodes = 32 #64
num_steps = 30001
starter_learning_rate = 0.02
graph = tf.Graph()
with graph.as_default():
    x = tf.placeholder(tf.float32, shape=(None, input_size))  # input width = train_dataset2.shape[1]
    y = tf.placeholder(tf.float32, shape=(None, output_size))
    weights_0 = tf.Variable(tf.truncated_normal([input_size, hidden_nodes], stddev=0.1, dtype=tf.float32))
    biases_0 = tf.Variable(tf.zeros([hidden_nodes], dtype=tf.float32))
    input_layer = tf.tanh(tf.matmul(x, weights_0) + biases_0)
    weights_1 = tf.Variable(tf.truncated_normal([hidden_nodes, output_size], stddev=0.1, dtype=tf.float32))
    biases_1 = tf.Variable(tf.zeros([output_size], dtype=tf.float32))
    y_ = tf.matmul(input_layer, weights_1) + biases_1
    # L2 penalty over all variables created so far (weights and biases)
    regularizers = sum([tf.nn.l2_loss(v) for v in tf.all_variables()])
    loss = tf.reduce_mean(tf.square(y_ - y))
    loss += 1e-5 * regularizers
    global_step = tf.Variable(0.0, trainable=False)
    learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, num_steps, 0.5, staircase=False)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step=global_step)
    """
    train_prediction = loss
    valid_prediction = tf.tanh(tf.matmul(tf_valid_dataset, weights_0) + biases_0)
    valid_prediction = tf.tanh(tf.matmul(valid_prediction, weights_1) + biases_1)
    valid_prediction = tf.matmul(valid_prediction, weights_2) + biases_2
    test_prediction = tf.tanh(tf.matmul(tf_test_dataset, weights_0) + biases_0)
    test_prediction = tf.tanh(tf.matmul(test_prediction, weights_1) + biases_1)
    test_prediction = tf.matmul(test_prediction, weights_2) + biases_2
    """
In [46]:
def stop(data, n=3):
    assert len(data) > n * 2
    avg = sum(data) / len(data)
    for x in (data - avg)[-n:]:
        if x <= 0:
            return False
    return True
from sklearn.metrics import explained_variance_score
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    sum_l = 0
    num_l = 0
    #ev_l = [0,0,0,0,0,0,0]
    print('Initialized')
    for step in range(num_steps):
        offset = (step * batch_size) % (train_output.shape[0] - batch_size)
        batch_data = train_dataset2[offset:(offset + batch_size), :]
        batch_output = train_output[offset:(offset + batch_size), :]
        feed_dict = {x: batch_data, y: batch_output}
        start_time = time.time()
        _, l, r = session.run([optimizer, loss, regularizers], feed_dict=feed_dict)
        duration = time.time() - start_time
        sum_l += l
        num_l += 1
        if (step % 500 == 0):
            valid_loss = loss.eval(feed_dict={x: valid_dataset2, y: valid_output})
            #print(predictions)
            #ev = explained_variance_score(y_.eval(feed_dict = {x: valid_dataset2, y: valid_output}), valid_output)
            #ev_l.append(valid_loss)
            #ev_l = ev_l[1:]
            print('Loss at step %d (%.2f op/sec): %.6f (%.6f); validation loss: %.6f' % (
                step, 1.0 / duration, sum_l / num_l, r,
                #accuracy_mse(valid_prediction.eval(), valid_output)))
                valid_loss))
            sum_l = 0
            num_l = 0
            #if stop(ev_l):
            #    print("Non-decreasing validation scores, so stopping early")
            #    break
    feed_dict = {x: test_dataset2, y: test_output}
    predictions, test_loss = session.run([y_, loss], feed_dict=feed_dict)
    #test_loss = loss.eval(feed_dict = {x: test_dataset2, y: test_output})
    print('Test MSE: %.4f' % test_loss)
    #print('Test losses:', test_losses)
    predicted_vs_actual = np.hstack((predictions, test_output))
In [47]:
over_95 = 0
print(predicted_vs_actual.shape)
cc = np.corrcoef(predicted_vs_actual[:, 0], predicted_vs_actual[:, 1])[0, 1]
max_err = np.max(np.abs(predicted_vs_actual[:, 0] - predicted_vs_actual[:, 1]))
rmse = np.sqrt(np.mean(np.square(predicted_vs_actual[:, 0] - predicted_vs_actual[:, 1])))
print("Results for location %d: correlation R: %.4f, RMSE: %.6f, max error: %.4f" % (location, cc, rmse, max_err))
k = np.argmax(np.abs(predicted_vs_actual[:,0]-predicted_vs_actual[:,1]))
print(k)
start = (k // 193)*193
stop = start + 193
#start = 0
#stop = 20*193
print(predicted_vs_actual.shape)
fig = plt.figure(figsize=(10, 6), dpi=80)
for i in range(1):
    sp = fig.add_subplot(1, 1, i + 1)
    sp.plot(predicted_vs_actual[start:stop, i], color="blue", linewidth=1.0, linestyle="-", label="ANN")
    sp.plot(predicted_vs_actual[start:stop, i + 1], color="red", linewidth=1.0, linestyle="-", label="actual")
plt.show()
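Note that np.corrcoef returns the linear correlation R, not the coefficient of determination R^2 = 1 - SS_res/SS_tot reported by explained-variance-style scores. A short check of both, computed directly from predicted_vs_actual (an illustrative addition):

pred = predicted_vs_actual[:, 0]
actual = predicted_vs_actual[:, 1]
ss_res = np.sum(np.square(actual - pred))
ss_tot = np.sum(np.square(actual - np.mean(actual)))
r2 = 1.0 - ss_res / ss_tot           # coefficient of determination
r = np.corrcoef(pred, actual)[0, 1]  # linear correlation
print("R^2 = %.4f, R = %.4f" % (r2, r))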
In [231]:
# Scratch test of the rolling-window update used for early stopping.
d = np.asarray([])
d = np.append(d, 6)
d = np.append(d, 5)
d = np.append(d, 4)
d = np.append(d, 3)
d = np.append(d, 2)
d = np.append(d, 1)[1:]   # drop the oldest entry (rolling window)
print(d)

def stop(data, n=2):
    assert len(data) > n * 2
    avg = sum(data) / len(data)
    for x in (data - avg)[-n:]:
        if x <= 0:
            return False
    return True

print(stop(d))
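The cell above exercises the rolling-window logic behind the commented-out early stop in the training session (the ev_l history there). Below is a sketch of how it could be wired in, assuming a check every 500 steps; the window length and the should_stop helper are illustrative additions, not part of the original notebook.

val_history = []   # recent validation losses, appended at every check

def should_stop(history, window=7, n=3):
    """True when the last n validation losses all sit above the window average."""
    recent = np.asarray(history[-window:])
    return len(recent) == window and stop(recent, n=n)

# Inside the training loop, after valid_loss is computed:
#     val_history.append(valid_loss)
#     if should_stop(val_history):
#         print("Validation loss stopped improving; stopping early")
#         break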
In [134]:
from sklearn.metrics import explained_variance_score
import scipy.stats as stats
import pylab
# Explained variance and error distribution for the single selected location.
# Column 0 of predicted_vs_actual holds the ANN prediction, column 1 the actual output.
y_pred = predicted_vs_actual[:, 0]
y_true = predicted_vs_actual[:, 1]
ev = explained_variance_score(y_true, y_pred)
print("Explained variance: ", ev)
diff = y_true - y_pred
#stats.probplot(diff, dist="norm", plot=pylab)
stats.probplot(diff, dist="t", sparams=(2,), plot=pylab)
pylab.show()
num_bins = 100
# the histogram of the errors
n, bins, patches = plt.hist(diff, num_bins, normed=1, facecolor='green', alpha=0.5)
params = stats.t.fit(diff)
dist = stats.t(params[0], params[1], params[2])
x = np.linspace(-2, 2, num_bins)
plt.plot(x, dist.pdf(x), 'b-', lw = 3, alpha=0.5, label='t pdf')
plt.show()
print(params)
"""
num_bins = 100
# the histogram of the errors
n, bins, patches = plt.hist(diff, num_bins, normed=1, facecolor='green', alpha=0.5)
# add a normal PDF
mu = 0
sigma = .05
y = mlab.normpdf(bins, mu, sigma)
plt.plot(bins, y, 'r-')
plt.xlabel('Smarts')
plt.ylabel('Probability')
# add Cauchy PDF
params = cauchy.fit(diff)
print(params)
dist = cauchy(params[0], params[1])
x = np.linspace(-2, 2, num_bins)
plt.plot(x, dist.pdf(x), 'b-', alpha=0.5, label='cauchy pdf')
# Tweak spacing to prevent clipping of ylabel
#plt.subplots_adjust(left=0.15)
#plt.show()
fig = plt.figure(figsize=(10,6),dpi=80)
plt.hist(diff, bins = 100, alpha=0.5)
plt.show()
"""