In [2]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import time
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
import tensorflow as tf
import scipy
In [3]:
# Download and save the archived data
url = 'http://mrtee.europa.renci.org/~bblanton/ANN/'
to = "../data"
def maybe_download(filename, expected_bytes, force=False):
    """Download a file if not present, and make sure it's the right size."""
    dest = os.path.join(to, filename)
    print(dest)
    print(url + filename)
    if force or not os.path.exists(dest):
        filename, _ = urlretrieve(url + filename, dest)
    statinfo = os.stat(dest)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

data_filename = maybe_download('ann_dataset1.tar', 5642240)
In [4]:
# Two output data set
# Extract files from the archive
def maybe_extract(filename, force=False):
    extract_folder = os.path.splitext(os.path.splitext(filename)[0])[0]  # strip .tar / .tar.gz
    root = os.path.dirname(filename)
    if os.path.isdir(extract_folder) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s.' % (extract_folder, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        tar = tarfile.open(filename)
        sys.stdout.flush()
        tar.extractall(path=root)
        tar.close()
    data_files = [
        os.path.join(extract_folder, d) for d in sorted(os.listdir(extract_folder))]
    return data_files
data_filename = "../data/ann_dataset1.tar"
data_files = maybe_extract(data_filename)
# Load files and produce dataset
def maybe_load(file_names):
    names = ('index', 'time', 'long', 'lat', 'param1', 'param2', 'param3', 'param4', 'out1', 'out2')
    datafile_length = 193
    dataset = np.ndarray(shape=(len(file_names), datafile_length, len(names)))
    for i in range(len(file_names)):
        a = np.loadtxt(file_names[i])
        a = np.asarray([x for xs in a for x in xs], dtype='d').reshape([datafile_length, len(names)])
        dataset[i, :, :] = a
        if i % 100 == 0:
            print("Processed %d/%d" % (i, len(file_names)))
    return dataset
dataset = maybe_load(data_files)
print(dataset.shape)
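As a quick sanity check on the two-output dataset just loaded (a minimal sketch using only NumPy; not part of the original run), the array can be inspected for shape and missing values before it is split:
# Optional sanity check: expect (num_files, 193, 10) and no NaNs.
assert dataset.ndim == 3 and dataset.shape[1:] == (193, 10)
print("any NaNs:", np.isnan(dataset).any())
print("value range:", dataset.min(), dataset.max())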
In [5]:
# Ten output data set
# Extract files from the archive
def maybe_extract(filename, force=False):
    extract_folder = os.path.splitext(os.path.splitext(filename)[0])[0]  # strip .tar / .tar.gz
    root = os.path.dirname(filename)
    if os.path.isdir(extract_folder) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s.' % (extract_folder, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        tar = tarfile.open(filename)
        sys.stdout.flush()
        tar.extractall(path=root)
        tar.close()
    data_files = [
        os.path.join(extract_folder, d) for d in sorted(os.listdir(extract_folder))]
    return data_files
data_filename = "../data/ann_dataset_10points.tar"
data_files = maybe_extract(data_filename)
# Load files and produce dataset
def maybe_load(file_names):
    names = ('index', 'time', 'long', 'lat', 'param1', 'param2', 'param3', 'param4', 'out1', 'out2',
             'out3', 'out4', 'out5', 'out6', 'out7', 'out8', 'out9', 'out10')
    datafile_length = 193
    dataset = np.ndarray(shape=(len(file_names), datafile_length, len(names)))
    for i in range(len(file_names)):
        a = np.loadtxt(file_names[i])
        a = np.asarray([x for xs in a for x in xs], dtype='d').reshape([datafile_length, len(names)])
        dataset[i, :, :] = a
        if i % 100 == 0:
            print("Processed %d/%d" % (i, len(file_names)))
    return dataset
dataset = maybe_load(data_files)
print(dataset.shape)
In [43]:
# train, validation, and test dataset percentages
train_percent = 70
valid_percent = 15
test_percent = 15
# train, validation, and test dataset indices
# test: test_start : valid_start-1
# validation: valid_start : train_start-1
# training: train_start : dataset.shape[0]
test_start = 0
valid_start = 48 #int(test_percent/100.0*dataset.shape[0])
train_start = 48 + 48 #int((test_percent+valid_percent)/100.0*dataset.shape[0])
# Shuffle file indices
file_indices = list(range(dataset.shape[0]))  # list() so the indices can be shuffled in place under Python 3
np.random.shuffle(file_indices)
# Assign datasets
test_dataset = np.array([dataset[j,:,:] for j in [file_indices[i] for i in range(test_start, valid_start)]])
valid_dataset = np.array([dataset[j,:,:] for j in [file_indices[i] for i in range(valid_start, train_start)]])
train_dataset = np.array([dataset[j,:,:] for j in [file_indices[i] for i in range(train_start, dataset.shape[0])]])
# Save memory
#del(dataset)
print("Test dataset: "+str(test_dataset.shape))
print("Validation dataset: "+str(valid_dataset.shape))
print("Training dataset: "+str(train_dataset.shape))
In [44]:
def accuracy_mse(predictions, outputs):
    err = predictions - outputs
    return np.mean(err*err)

def Normalize(x, means, stds):
    return (x - means)/stds
# Reshape the data and normalize
train_dataset2 = train_dataset[:,:,1:7].reshape((-1, 6)).astype(np.float32)
train_output = train_dataset[:,:,8:18].reshape((-1, 10)).astype(np.float32)
print("train outputs: ",train_dataset.shape)
# calculate means and stds for training dataset
input_means = [np.mean(train_dataset2[:,i]) for i in range(train_dataset2.shape[1])]
input_stds = [np.std(train_dataset2[:,i]) for i in range(train_dataset2.shape[1])]
output_means = [np.mean(train_output[:,i]) for i in range(train_output.shape[1])]
output_stds = [np.std(train_output[:,i]) for i in range(train_output.shape[1])]
train_dataset2 = Normalize(train_dataset2, input_means, input_stds)
#train_output = Normalize(train_output, output_means, output_stds)
print(train_dataset2.shape)
print(train_output.shape)
#plt.plot(train_dataset2[:193,:],label="input")
#plt.plot(train_output[:193,:],label="output")
#plt.ylabel("training data")
#plt.legend(loc='upper right', shadow=True)
#plt.show()
valid_dataset2 = Normalize(valid_dataset[:,:,1:7].reshape((-1, 6)).astype(np.float32), input_means, input_stds)
valid_output = valid_dataset[:,:,8:18].reshape((-1, 10)).astype(np.float32)
#valid_output = Normalize(valid_dataset[:,:,8:18].reshape((-1, 2)).astype(np.float32), output_means, output_stds)
test_dataset2 = Normalize(test_dataset[:,:,1:7].reshape((-1, 6)).astype(np.float32),input_means, input_stds)
test_output = test_dataset[:,:,8:18].reshape((-1, 10)).astype(np.float32)
#test_output = Normalize(test_dataset[:,:,8:18].reshape((-1, 2)).astype(np.float32), output_means, output_stds)
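A quick check (sketch, assuming the variables defined in this cell) that the normalization behaved as expected: every input column of the normalized training set should now have mean close to 0 and standard deviation close to 1, while the validation and test inputs, scaled with the training statistics, will only be approximately standardized.
# Verify the training inputs are standardized column-wise.
print("train input means:", np.round(train_dataset2.mean(axis=0), 3))
print("train input stds: ", np.round(train_dataset2.std(axis=0), 3))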
In [13]:
num_embeds = 2 # number of lagged copies of the input to stack; 0 keeps the original array unchanged
#print(train_dataset2[:-1,:])
#print(train_dataset2[1:,:])
def get_embeddings(dataset, num_embeds = 1):
    dataset_list = []
    if num_embeds == 0:
        return dataset
    for i in range(num_embeds):
        dataset_list.append(dataset[i:(-num_embeds+i), :])
    #dataset_list.append(dataset[num_embeds:, :])
    return np.hstack(dataset_list)
train_dataset2 = get_embeddings(train_dataset2, num_embeds)
valid_dataset2 = get_embeddings(valid_dataset2, num_embeds)
test_dataset2 = get_embeddings(test_dataset2, num_embeds)
train_output = train_output[num_embeds:,:]
valid_output = valid_output[num_embeds:,:]
test_output = test_output[num_embeds:,:]
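To make the effect of get_embeddings concrete, a toy example (sketch) with a 5x1 array and num_embeds=2: the result stacks the array with its one-step lagged copy and drops the last num_embeds rows, which is why the outputs above are trimmed with [num_embeds:, :].
# Toy illustration of the lagged stacking performed by get_embeddings.
toy = np.arange(5.0).reshape(5, 1)
print(get_embeddings(toy, 2))
# rows: [0, 1], [1, 2], [2, 3] -- three rows, matching toy[2:] in length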
In [8]:
import sklearn.decomposition as deco
print(train_dataset2.shape)
x = train_dataset2
pca = deco.PCA(n_components = 'mle') # 'mle' lets PCA choose how many components to keep
x_r = pca.fit(x).transform(x)
#pca.fit(x)
print('n_components:', pca.n_components_)
#for c in pca.components_:
#    print(c)
print('explained variance (%d components): %.2f' % (pca.n_components_, sum(pca.explained_variance_ratio_)))
#===================================================
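An alternative to n_components='mle' (sketch, using the same x as above): keep the smallest number of components whose cumulative explained variance ratio crosses a chosen threshold, e.g. 99%.
# Sketch: pick the component count by cumulative explained variance.
pca_full = deco.PCA().fit(x)
cum_var = np.cumsum(pca_full.explained_variance_ratio_)
n_keep = int(np.searchsorted(cum_var, 0.99) + 1)
print("components for 99%% explained variance: %d" % n_keep)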
In [93]:
def variance(x):
    return tf.reduce_mean(tf.square(x - tf.reduce_mean(x)))

def explained_var(y_true, y_predicted):
    return 1 - tf.div(variance(tf.sub(y_true, y_predicted)), variance(y_true))
input_size = train_dataset2.shape[1]
output_size = 10
# Deep ANN
batch_size = 19*193
hidden_nodes_1 = 128 #64
hidden_nodes_2 = 256 #64
num_steps = 10001
starter_learning_rate = 0.001
graph = tf.Graph()
with graph.as_default():
    # Input data.
    # The training data is fed in mini-batches through placeholders; the validation
    # and test sets are attached to the graph as constants.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, input_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, output_size))
    tf_valid_dataset = tf.constant(valid_dataset2)
    tf_test_dataset = tf.constant(test_dataset2)
    weights_0 = tf.Variable(tf.truncated_normal([input_size, hidden_nodes_1], stddev=0.1, dtype=tf.float32))
    biases_0 = tf.Variable(tf.zeros([hidden_nodes_1], dtype=tf.float32))
    weights_1 = tf.Variable(tf.truncated_normal([hidden_nodes_1, hidden_nodes_2], stddev=0.1, dtype=tf.float32))
    biases_1 = tf.Variable(tf.zeros([hidden_nodes_2], dtype=tf.float32))
    weights_2 = tf.Variable(tf.truncated_normal([hidden_nodes_2, output_size], stddev=0.1, dtype=tf.float32))
    biases_2 = tf.Variable(tf.zeros([output_size], dtype=tf.float32))
    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(weights_0) + tf.nn.l2_loss(biases_0) +
                    tf.nn.l2_loss(weights_1) + tf.nn.l2_loss(biases_1) +
                    tf.nn.l2_loss(weights_2) + tf.nn.l2_loss(biases_2))
    #regularizers = sum([tf.nn.l2_loss(v) for v in tf.all_variables()])
    # Forward pass: two tanh hidden layers followed by a linear output layer.
    input_layer_output = tf.tanh(tf.matmul(tf_train_dataset, weights_0) + biases_0)
    hidden_layer_output = tf.tanh(tf.matmul(input_layer_output, weights_1) + biases_1)
    #hidden_layer_output = tf.nn.dropout(hidden_layer_output, 0.5)
    hidden_layer_output = tf.matmul(hidden_layer_output, weights_2) + biases_2
    # Standard loss is MSE.
    loss = tf.cast(tf.reduce_mean(tf.square(hidden_layer_output - tf_train_labels)), tf.float32)
    # Add the regularization term to the loss.
    loss += 1e-5 * regularizers
    train_ev = explained_var(tf_train_labels, hidden_layer_output)
    global_step = tf.Variable(0.0, trainable=False)
    learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, num_steps, 0.4, staircase=False)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step=global_step)
    train_prediction = loss
    # Validation and test predictions reuse the same weights as the training network.
    valid_prediction = tf.tanh(tf.matmul(tf_valid_dataset, weights_0) + biases_0)
    valid_prediction = tf.tanh(tf.matmul(valid_prediction, weights_1) + biases_1)
    valid_prediction = tf.matmul(valid_prediction, weights_2) + biases_2
    test_prediction = tf.tanh(tf.matmul(tf_test_dataset, weights_0) + biases_0)
    test_prediction = tf.tanh(tf.matmul(test_prediction, weights_1) + biases_1)
    test_prediction = tf.matmul(test_prediction, weights_2) + biases_2
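The training, validation and test networks above repeat the same two-tanh-layer forward pass; a sketch of a small helper that would express it once (forward is a name introduced here, not in the original graph; it would have to be defined and called inside the with graph.as_default(): block, using the same old-style TensorFlow API):
# Sketch only: shared forward pass for the train/valid/test predictions.
def forward(x):
    h1 = tf.tanh(tf.matmul(x, weights_0) + biases_0)
    h2 = tf.tanh(tf.matmul(h1, weights_1) + biases_1)
    return tf.matmul(h2, weights_2) + biases_2
# e.g. valid_prediction = forward(tf_valid_dataset); test_prediction = forward(tf_test_dataset)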
In [94]:
def stop(data, n = 3):
    """Return True when the last n scores all fall below the running average."""
    data = np.asarray(data)  # accept a plain Python list such as ev_l below
    assert len(data) > n*2
    avg = sum(data)/len(data)
    for x in (data - avg)[-n:]:
        if x >= 0:
            return False
    return True
from sklearn.metrics import explained_variance_score
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    sum_l = 0
    num_l = 0
    ev_l = [0, 0, 0, 0, 0, 0, 0]
    print('Initialized')
    for step in range(num_steps):
        offset = (step * batch_size) % (train_output.shape[0] - batch_size)
        batch_data = train_dataset2[offset:(offset + batch_size), :]
        batch_output = train_output[offset:(offset + batch_size), :]
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_output}
        start_time = time.time()
        _, l, reg, predictions = session.run([optimizer, loss, regularizers, train_prediction], feed_dict=feed_dict)
        duration = time.time() - start_time
        sum_l += l
        num_l += 1
        if (step % 500 == 0):
            # explained_variance_score expects (y_true, y_pred)
            ev = explained_variance_score(valid_output, valid_prediction.eval())
            ev_l.append(ev)
            ev_l = ev_l[1:]
            print('Loss at step %d (%.2f op/sec): %f (%.8f); validation explained variance: %.6f' % (
                step, 1.0/duration, sum_l/num_l,
                1e-5 * reg,
                ev))
            sum_l = 0
            num_l = 0
            if stop(ev_l):
                print("Non-increasing scores, so stopping early")
                break
    print('Test MSE: %.4f' % accuracy_mse(test_prediction.eval(), test_output))
    predicted_vs_actual = np.hstack((test_prediction.eval(), test_output))
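For the analysis cells below it helps to keep the column layout of predicted_vs_actual in mind: the first 10 columns are the ANN predictions, the last 10 are the actual values. A minimal sketch splitting it back apart and computing a per-point MSE (pred, actual and per_point_mse are illustrative names):
# Sketch: per-output-point MSE from the stacked prediction/actual array.
pred, actual = predicted_vs_actual[:, :10], predicted_vs_actual[:, 10:]
per_point_mse = np.mean(np.square(pred - actual), axis=0)
print(per_point_mse)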
In [95]:
over_95 = 0
for i in range(10):
    cc = np.corrcoef(predicted_vs_actual[:, i], predicted_vs_actual[:, i+10])[0, 1]
    m = np.max(np.abs(predicted_vs_actual[:, i] - predicted_vs_actual[:, i+10]))
    mae = np.mean(np.abs(predicted_vs_actual[:, i] - predicted_vs_actual[:, i+10]))
    if cc >= 0.95:
        over_95 += 1
    print('Point %d: max abs error: %.6f, MAE: %.6f, CC: %.6f %c' % (i, m, mae, cc, 'v' if cc >= 0.95 else ' '))
i = 3
k = np.argmax(np.abs(predicted_vs_actual[:,i]-predicted_vs_actual[:,i+10]))
print(k)
max_error_case = [np.abs(predicted_vs_actual[k,i]-predicted_vs_actual[k,i+10]) for i in range(10)]
print(max_error_case)
start = (k // 193)*193
stop = start + 193
#start = 0
#stop = 20*193
print(predicted_vs_actual.shape)
fig = plt.figure(figsize=(10, 6), dpi=80)
for i in range(10):
    sp = fig.add_subplot(10, 1, i+1)
    sp.plot(predicted_vs_actual[start:stop, i], color="blue", linewidth=1.0, linestyle="-", label="ANN")
    sp.plot(predicted_vs_actual[start:stop, i+10], color="red", linewidth=1.0, linestyle="-", label="actual")
plt.show()
In [231]:
d = np.asarray([])
d = np.append(d,3)
d = np.append(d,2)
d = np.append(d,1)[1:]
print(d)
def stop(data, n = 2):
    assert len(data) > n*2
    avg = sum(data)/len(data)
    for x in (data - avg)[-n:]:
        if x <= 0:
            return False
    return True
print(stop(d))
In [88]:
from sklearn.metrics import explained_variance_score
import scipy.stats as stats
import pylab
ev = []
for i in range(10):
    y_pred = predicted_vs_actual[:, i]       # ANN predictions are the first 10 columns
    y_true = predicted_vs_actual[:, i+10]    # actual values are the last 10 columns
    ev.append(explained_variance_score(y_true, y_pred))
print("Explained variance: ", ev)
diff = y_true - y_pred  # residuals for the last output point
#stats.probplot(diff, dist="norm", plot=pylab)
stats.probplot(diff, dist="t", sparams=(2), plot=pylab)
pylab.show()
num_bins = 100
# the histogram of the errors
n, bins, patches = plt.hist(diff, num_bins, normed=1, facecolor='green', alpha=0.5)
params = stats.t.fit(diff)
dist = stats.t(params[0], params[1], params[2])
x = np.linspace(-2, 2, num_bins)
plt.plot(x, dist.pdf(x), 'b-', lw = 3, alpha=0.5, label='t pdf')
plt.show()
print(params)
"""
num_bins = 100
# the histogram of the errors
n, bins, patches = plt.hist(diff, num_bins, normed=1, facecolor='green', alpha=0.5)
# add a normal PDF
mu = 0
sigma = .05
y = mlab.normpdf(bins, mu, sigma)
plt.plot(bins, y, 'r-')
plt.xlabel('Smarts')
plt.ylabel('Probability')
# add Cauchy PDF
params = cauchy.fit(diff)
print(params)
dist = cauchy(params[0], params[1])
x = np.linspace(-2, 2, num_bins)
plt.plot(x, dist.pdf(x), 'b-', alpha=0.5, label='cauchy pdf')
# Tweak spacing to prevent clipping of ylabel
#plt.subplots_adjust(left=0.15)
#plt.show()
fig = plt.figure(figsize=(10,6),dpi=80)
plt.hist(diff, bins = 100, alpha=0.5)
plt.show()
"""