In [1]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import time
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
import tensorflow as tf
import scipy
In [2]:
# Download and save the archived data
url = 'http://mrtee.europa.renci.org/~bblanton/ANN/'
to = "../data"
def maybe_download(filename, expected_bytes, force=False):
    """Download a file if not present, and make sure it's the right size."""
    dest = os.path.join(to, filename)
    print(dest)
    print(url + filename)
    if force or not os.path.exists(dest):
        dest, _ = urlretrieve(url + filename, dest)
    statinfo = os.stat(dest)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return dest
data_filename = maybe_download('ann_dataset1.tar', 5642240)
In [7]:
# Ten output data set
# Extract files from the archive
def maybe_extract(filename, force=False):
    extract_folder = os.path.splitext(os.path.splitext(filename)[0])[0]  # strip .tar / .tar.gz extension
    root = os.path.dirname(filename)
    if os.path.isdir(extract_folder) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s.' % (extract_folder, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        tar = tarfile.open(filename)
        sys.stdout.flush()
        tar.extractall(path=root)
        tar.close()
    data_files = [os.path.join(extract_folder, d) for d in sorted(os.listdir(extract_folder))]
    return data_files
data_filename = "../data/ann_dataset_10points.tar"
data_files = maybe_extract(data_filename)
# Load files and produce dataset
def maybe_load(file_names):
    names = ('index', 'time', 'long', 'lat', 'param1', 'param2', 'param3', 'param4',
             'out1', 'out2', 'out3', 'out4', 'out5', 'out6', 'out7', 'out8', 'out9', 'out10')
    datafile_length = 193
    dataset = np.ndarray(shape=(len(file_names), datafile_length, len(names)))
    for i in range(len(file_names)):
        a = np.loadtxt(file_names[i], dtype='d').reshape([datafile_length, len(names)])
        dataset[i, :, :] = a
        if i % 100 == 0:
            print("Processed %d/%d\n" % (i, len(file_names)))
    return dataset
dataset = maybe_load(data_files)
print(dataset.shape)
In [157]:
"""
We need to convert the dataset into b(x) = [1 x1 x2 ... x6 x1^2 x1x2 x1x3 ... x6^2]
"""
def Normalize(x, means, stds):
return (x-means)/stds
flat_dataset = np.reshape(dataset,(-1,18)) #
inputs_only = flat_dataset[:,1:7]
outputs_only = flat_dataset[:,8+9:9+9]
input_means = [np.mean(inputs_only) for i in range(inputs_only.shape[1])]
input_stds = [np.std(inputs_only) for i in range(inputs_only.shape[1])]
inputs_only = (inputs_only-input_means)/input_stds
inputs_quadratic = np.reshape(np.ones(inputs_only.shape[0]),(-1,1))
inputs_quadratic = np.hstack((inputs_quadratic, inputs_only))
for j in range(6):
for k in range(j,6):
ds = np.reshape(np.multiply(inputs_only[:,i],inputs_only[:,j]),(-1,1))
inputs_quadratic = np.hstack((inputs_quadratic, ds))
test_start = 0
valid_start = 48*193 #int(test_percent/100.0*dataset.shape[0])
train_start = (48+48)*193 #int((test_percent+valid_percent)/100.0*dataset.shape[0])
test_dataset = np.hstack((inputs_quadratic[test_start:valid_start,:],outputs_only[test_start:valid_start,:]))
valid_dataset = np.hstack((inputs_quadratic[valid_start:train_start,:],outputs_only[valid_start:train_start,:]))
train_dataset = np.hstack((inputs_quadratic[train_start:,:],outputs_only[train_start:,:]))
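As a quick sanity check on the feature construction (not part of the original pipeline), scikit-learn's PolynomialFeatures should produce the same 28 columns; a minimal sketch, assuming inputs_only and inputs_quadratic from the cell above:

from sklearn.preprocessing import PolynomialFeatures

# degree-2 expansion with bias: 1 + 6 linear + 21 pairwise products = 28 columns
poly = PolynomialFeatures(degree=2, include_bias=True)
inputs_quadratic_check = poly.fit_transform(inputs_only)
print(inputs_quadratic_check.shape, inputs_quadratic.shape)  # both should have 28 columns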
In [217]:
print(train_dataset.shape)
B = train_dataset
x = test_dataset[234, 1:7]      # linear (non-quadratic) input features of one arbitrary test sample
d = train_dataset[:, 1:7] - x   # differences to every training sample in that feature space
print(d.shape)
In [207]:
num_steps = 1001
starter_learning_rate = 0.001
num_support = train_dataset.shape[0]   # initially use every training sample as a support vector
graph = tf.Graph()
with graph.as_default():
    support_x = tf.placeholder(tf.float32, shape=(None, 28))   # support vectors, i.e. initially the whole training set
    support_y_ = tf.placeholder(tf.float32, shape=(None, 1))   # outputs at the support vectors
    x = tf.placeholder(tf.float32, shape=(None, 28))           # query vectors
    y_ = tf.placeholder(tf.float32, shape=(None, 1))           # outputs at the query vectors
    # coefficients a
    a_matrix = tf.Variable(tf.truncated_normal([28, 1], stddev=0.1, dtype=tf.float32))
    # calculate outputs at support vectors
    support_y = tf.matmul(support_x, a_matrix)
    # support vector weights
    w_matrix = tf.Variable(tf.truncated_normal([num_support, 1], stddev=0.1, dtype=tf.float32))
    # weighted squared error at the support vectors, used to optimize a_matrix and w_matrix
    weighted_diff = tf.matmul(tf.transpose(tf.square(support_y - support_y_)), tf.abs(w_matrix))
    loss = tf.reduce_mean(weighted_diff)
    global_step = tf.Variable(0.0, trainable=False)
    learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, num_steps, 1.0, staircase=False)
    # Create ADAM optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate)
    # Calculate gradients, clip them, and apply them
    grads, v = zip(*optimizer.compute_gradients(loss))
    grads, _ = tf.clip_by_global_norm(grads, 1.25)
    apply_gradient_op = optimizer.apply_gradients(zip(grads, v), global_step=global_step)
In [208]:
from sklearn.metrics import explained_variance_score
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    for step in range(num_steps):
        #offset = (step * batch_size) % (train_output.shape[0] - batch_size)
        #batch_data = train_dataset[offset:(offset + batch_size),:28].astype(np.float32)
        #batch_output = train_dataset[offset:(offset + batch_size), 28:].astype(np.float32)
        batch_data = train_dataset[:, :28].astype(np.float32)
        batch_output = train_dataset[:, 28:].astype(np.float32)
        # the loss is defined on the support vectors, which start out as the whole training set
        feed_dict = {support_x: batch_data, support_y_: batch_output,
                     x: batch_data, y_: batch_output}
        start_time = time.time()
        _, l = session.run([apply_gradient_op, loss], feed_dict=feed_dict)
        duration = time.time() - start_time
        if step % 50 == 0:
            #valid_loss = loss.eval(feed_dict = {x: valid_dataset[:,:28], y_: valid_dataset[:,28:]})
            valid_loss = 0
            #print(predictions)
            #ev = explained_variance_score(y_.eval(feed_dict = {x: valid_dataset2, y: valid_output}), valid_output)
            #ev_l.append(valid_loss)
            #ev_l = ev_l[1:]
            print('Loss at step %d (%.2f op/sec): %f; validation loss: %.6f' % (
                step, 1.0/duration, l, valid_loss))
            #if stop(ev_l):
            #    print("Non decreasing scores, so stopping early")
            #    break
    #feed_dict = {x: test_dataset[:,:28], y_: test_dataset[:,28:]}
    #predictions, test_loss = session.run([y, loss], feed_dict=feed_dict)
    #test_loss = loss.eval(feed_dict = {x: test_dataset2, y: test_output})
    #print('Test MSE: %.4f' % test_loss)
    #print('Test losses:', test_losses)
    #predicted_vs_actual = np.hstack((predictions, test_output))
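Since the weighted model above is still experimental, an ordinary least-squares fit on the same 28-column quadratic features gives a quick closed-form point of comparison. This is a sketch only, not part of the original notebook, and assumes train_dataset and test_dataset from cell In [157]:

# Closed-form least-squares baseline on the quadratic features (illustrative only)
A = train_dataset[:, :28]
b = train_dataset[:, 28:]
coeffs, residuals, rank, sv = np.linalg.lstsq(A, b)
test_pred = test_dataset[:, :28].dot(coeffs)
print('Baseline test MSE: %.6f' % np.mean(np.square(test_pred - test_dataset[:, 28:])))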
In [156]:
over_95 = 0
for i in range(10):
    cc = np.corrcoef(predicted_vs_actual[:, i], predicted_vs_actual[:, i+10])[0, 1]
    m = np.max(np.abs(predicted_vs_actual[:, i] - predicted_vs_actual[:, i+10]))
    mse = np.mean(np.square(predicted_vs_actual[:, i] - predicted_vs_actual[:, i+10]))
    if cc >= 0.95:
        over_95 += 1
    print('Point %d: Max loss: %.6f, MSE: %.6f CC: %.6f %c' % (i, m, mse, cc, 'v' if cc >= 0.95 else ' '))
i = 5
k = np.argmax(np.abs(predicted_vs_actual[:, i] - predicted_vs_actual[:, i+10]))
print(k)
max_error_case = [np.abs(predicted_vs_actual[k, i] - predicted_vs_actual[k, i+10]) for i in range(10)]
print(max_error_case)
start = (k // 193) * 193
stop = start + 193
start = 0
stop = 20 * 193
print(predicted_vs_actual.shape)
fig = plt.figure(figsize=(10, 6), dpi=80)
for i in range(10):
    sp = fig.add_subplot(10, 1, i+1)
    sp.plot(predicted_vs_actual[start:stop, i], color="blue", linewidth=1.0, linestyle="-", label="ANN")
    sp.plot(predicted_vs_actual[start:stop, i+10], color="red", linewidth=1.0, linestyle="-", label="actual")
plt.show()
In [231]:
d = np.asarray([])
d = np.append(d,3)
d = np.append(d,2)
d = np.append(d,1)[1:]
print(d)
def stop(data, n=2):
    """Return True when the last n values all lie above the running average,
    i.e. the tracked score has stopped improving."""
    assert len(data) >= n
    avg = sum(data) / len(data)
    for x in (data - avg)[-n:]:
        if x <= 0:
            return False
    return True
print(stop(d))
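A quick illustration of the early-stopping rule on made-up score histories (values are arbitrary):

rising = np.array([0.9, 0.3, 0.2, 0.6, 0.7])    # validation loss went back up: last 2 values above the average
falling = np.array([0.9, 0.7, 0.5, 0.3, 0.2])   # validation loss still decreasing
print(stop(rising, n=2))    # True  -> stop early
print(stop(falling, n=2))   # False -> keep training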
In [134]:
from sklearn.metrics import explained_variance_score
import scipy.stats as stats
import pylab
ev = []
for i in range(10):
    y_true = predicted_vs_actual[:, i]
    y_pred = predicted_vs_actual[:, i+10]
    ev.append(explained_variance_score(y_true, y_pred))
print("Explained variance: ", ev)
# QQ-plot of the prediction errors (last point) against a Student's t distribution
diff = y_true - y_pred
#stats.probplot(diff, dist="norm", plot=pylab)
stats.probplot(diff, dist="t", sparams=(2,), plot=pylab)
pylab.show()
num_bins = 100
# histogram of the errors with a fitted t PDF overlaid
n, bins, patches = plt.hist(diff, num_bins, normed=1, facecolor='green', alpha=0.5)
params = stats.t.fit(diff)                      # (degrees of freedom, location, scale)
dist = stats.t(params[0], params[1], params[2])
x = np.linspace(-2, 2, num_bins)
plt.plot(x, dist.pdf(x), 'b-', lw=3, alpha=0.5, label='t pdf')
plt.show()
print(params)
"""
num_bins = 100
# the histogram of the errors
n, bins, patches = plt.hist(diff, num_bins, normed=1, facecolor='green', alpha=0.5)
# add a normal PDF
mu = 0
sigma = .05
y = mlab.normpdf(bins, mu, sigma)
plt.plot(bins, y, 'r-')
plt.xlabel('Smarts')
plt.ylabel('Probability')
# add Cauchy PDF
params = cauchy.fit(diff)
print(params)
dist = cauchy(params[0], params[1])
x = np.linspace(-2, 2, num_bins)
plt.plot(x, dist.pdf(x), 'b-', alpha=0.5, label='cauchy pdf')
# Tweak spacing to prevent clipping of ylabel
#plt.subplots_adjust(left=0.15)
#plt.show()
fig = plt.figure(figsize=(10,6),dpi=80)
plt.hist(diff, bins = 100, alpha=0.5)
plt.show()
"""