In [1]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
import tensorflow as tf
euclidean_dist_module = tf.load_op_library("euclidean_dist.so")
euclidean_dist = euclidean_dist_module.euclidean_dist
euclidean_dist_grad = euclidean_dist_module.euclidean_dist_grad
from tensorflow.python.framework import ops
@ops.RegisterGradient("EuclideanDist")
def _EuclideanDistGrad(op, grad):
    a = op.inputs[0]
    b = op.inputs[1]
    y = op.outputs[0]  # forward output (the pairwise distances), reused by the gradient kernel
    xGrad, cGrad = euclidean_dist_grad(a, b, y, grad)
    return xGrad, cGrad
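(A quick sanity check of the registered gradient against finite differences; this sketch is not part of the original run and assumes the [batch, in] x [in, out] shape convention used later in this notebook.)
import numpy as np
with tf.Session():
    a_chk = tf.constant(np.random.normal(0, 1, [4, 3]).astype(np.float32))
    c_chk = tf.constant(np.random.normal(0, 1, [3, 5]).astype(np.float32))
    d_chk = euclidean_dist(a_chk, c_chk, 1)
    # compares the analytic gradient from euclidean_dist_grad with a numerical
    # estimate; the error should be near zero for a correct kernel
    err = tf.test.compute_gradient_error(a_chk, [4, 3], d_chk, [4, 5])
    print("max gradient error:", err)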
import time
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def RBFEuclidean(x, C, number_of_threads=1):
    """Computes the negative Euclidean distance from each row of x
    to the cluster centers stored in C."""
    return -euclidean_dist(x, C, number_of_threads)
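(For reference, a NumPy sketch of what RBFEuclidean computes, assuming the convention implied by the shapes below: x is [batch, in], C is [in, out], and entry (i, j) of the output is -||x[i, :] - C[:, j]||.)
import numpy as np
def rbf_euclidean_numpy(x, C):
    # broadcast to [batch, out, in] and reduce over the input dimension
    diff = x[:, None, :] - C.T[None, :, :]
    return -np.sqrt((diff ** 2).sum(axis=2))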
In [5]:
import inspect
print(inspect.getargspec(euclidean_dist))
In [8]:
sess = tf.InteractiveSession()
x = tf.placeholder(tf.float32, shape=[None, 784])
y_ = tf.placeholder(tf.float32, shape=[None, 10])
W_fc1 = weight_variable([784, 1000])
W_fc2 = weight_variable([1000, 10])
b_fc2 = bias_variable([10])
h_rbf = RBFEuclidean(x, W_fc1,1)
y_conv = RBFEuclidean(h_rbf, W_fc2,1) + b_fc2
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y_conv, labels=y_))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
sess.run(tf.initialize_all_variables())
start = time.time()
for i in range(20000):
    batch = mnist.train.next_batch(50)
    if i % 100 == 0:
        train_accuracy = accuracy.eval(feed_dict={x: batch[0], y_: batch[1]})
        print("\rstep %d, training accuracy %g" % (i, train_accuracy), end="" if i % 1000 else "\n")
    train_step.run(feed_dict={x: batch[0], y_: batch[1]})
elapsed = time.time()-start
print("\nTraining took {}".format(elapsed))
print("\ntest accuracy %g"%accuracy.eval(feed_dict={
x: mnist.test.images, y_: mnist.test.labels}))
In [1]:
import tensorflow as tf
euclidean_dist_module = tf.load_op_library("euclidean_dist.so")
euclidean_dist = euclidean_dist_module.euclidean_dist
euclidean_dist_grad = euclidean_dist_module.euclidean_dist_grad
from tensorflow.python.framework import ops
@ops.RegisterGradient("EuclideanDist")
def _EuclideanDistGrad(op, grad):
    a = op.inputs[0]
    b = op.inputs[1]
    y = op.outputs[0]  # forward output (the pairwise distances), reused by the gradient kernel
    xGrad, cGrad = euclidean_dist_grad(a, b, y, grad)
    return xGrad, cGrad
import time
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def RBFEuclidean(x, C, number_of_threads=1):
    """Computes the negative Euclidean distance from each row of x
    to the cluster centers stored in C."""
    return -euclidean_dist(x, C, number_of_threads)
In [2]:
repeat_i_times = 10
In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import numpy as np
sess = tf.InteractiveSession()
results = np.zeros([6, 3])
for i in range(repeat_i_times):
    for k in range(1, 4):
        x = tf.placeholder(tf.float32, shape=[None, 10**k])
        y_ = tf.placeholder(tf.float32, shape=[None, 10**k])
        W = weight_variable([10**k, 10**k])
        x_in = np.random.normal(0, 1, [10**k, 10**k])
        W_in = np.random.normal(0, 1, [10**k, 10**k])
        y_conv = RBFEuclidean(x, W)
        start = time.time()
        y_conv.eval({x: x_in, W: W_in})
        elapsed_time = time.time() - start
        results[5, k-1] += elapsed_time / repeat_i_times
        print("\nFeed-forward took {} with GPU and {} datapoints".format(elapsed_time, 10**k))
In [4]:
np.save("timings.npy",results)
In [3]:
results = np.load("timings.npy")
In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]=""
import numpy as np
sess = tf.InteractiveSession()
for i in range(repeat_i_times):
    for n in range(0, 5):
        for k in range(1, 4):
            x = tf.placeholder(tf.float32, shape=[None, 10**k])
            y_ = tf.placeholder(tf.float32, shape=[None, 10**k])
            W = weight_variable([10**k, 10**k])
            x_in = np.random.normal(0, 1, [10**k, 10**k])
            W_in = np.random.normal(0, 1, [10**k, 10**k])
            y_conv = RBFEuclidean(x, W, 2**n)
            start = time.time()
            y_conv.eval({x: x_in, W: W_in})
            elapsed_time = time.time() - start
            results[n, k-1] += elapsed_time / repeat_i_times
            print("\nFeed-forward took {} with {} threads and {} datapoints".format(elapsed_time, 2**n, 10**k))
In [33]:
%pylab inline
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
from matplotlib import pyplot as plt
plt.figure()
plt.suptitle("Feedforward", fontsize=24, y=1.05)
plt.subplot(2,2,1)
plt.title("Timing With Ten by Ten Sized Matrices")
plt.plot(results[:-1,0])
plt.scatter([5],results[-1:,0])
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Seconds to compute one calculation")
plt.subplot(2,2,2)
plt.title("Timing With a Hundred by Hundred Sized Matrices")
plt.plot(results[:-1,1])
plt.scatter([5],results[-1:,1])
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Seconds to compute one calculation")
plt.subplot(2,2,3)
plt.title("Timing With a Thousand by a Thousand Sized Matrices")
plt.plot(results[:-1,2])
plt.scatter([5],results[-1:,2])
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Seconds to compute one calculation")
plt.subplot(2,2,4)
plt.title("All Timings In Log Scale")
plt.plot(results[:-1])
plt.scatter([5],results[-1:,0],color="blue")
plt.scatter([5],results[-1:,1], color="green")
plt.scatter([5],results[-1:,2], color="red")
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Seconds to compute one calculation")
plt.yscale("log")
plt.tight_layout()
plt.savefig("feedforward.pgf")
plt.show()
np.save("feedforwardtimings.npy",results)
In [1]:
import tensorflow as tf
euclidean_dist_module = tf.load_op_library("euclidean_dist.so")
euclidean_dist = euclidean_dist_module.euclidean_dist
euclidean_dist_grad = euclidean_dist_module.euclidean_dist_grad
from tensorflow.python.framework import ops
@ops.RegisterGradient("EuclideanDist")
def _EuclideanDistGrad(op, grad):
    a = op.inputs[0]
    b = op.inputs[1]
    y = op.outputs[0]  # forward output (the pairwise distances), reused by the gradient kernel
    xGrad, cGrad = euclidean_dist_grad(a, b, y, grad)
    return xGrad, cGrad
import time
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def RBFEuclidean(x, C, number_of_threads=1):
    """Computes the negative Euclidean distance from each row of x
    to the cluster centers stored in C."""
    return -euclidean_dist(x, C, number_of_threads)
In [2]:
repeat_i_times = 10
In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import numpy as np
sess = tf.InteractiveSession()
results = np.zeros([6, 3])
for i in range(repeat_i_times):
    for k in range(1, 4):
        x = tf.placeholder(tf.float32, shape=[None, 10**k])
        y_ = tf.placeholder(tf.float32, shape=[None, 10**k])
        W = weight_variable([10**k, 10**k])
        x_in = np.random.normal(0, 1, [10**k, 10**k])
        W_in = np.random.normal(0, 1, [10**k, 10**k])
        y_conv = RBFEuclidean(x, W)
        grad = tf.gradients(y_conv, [x, W])
        start = time.time()
        tf.get_default_session().run(grad, feed_dict={x: x_in, W: W_in})
        elapsed_time = time.time() - start
        results[5, k-1] += elapsed_time / repeat_i_times
        print("\nBackprop took {} with GPU and {} datapoints".format(elapsed_time, 10**k))
In [5]:
np.save("timings.npy",results)
In [3]:
results = np.load("timings.npy")
In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]=""
import numpy as np
sess = tf.InteractiveSession()
for i in range(repeat_i_times):
    for n in range(0, 5):
        for k in range(1, 4):
            x = tf.placeholder(tf.float32, shape=[None, 10**k])
            y_ = tf.placeholder(tf.float32, shape=[None, 10**k])
            W = weight_variable([10**k, 10**k])
            x_in = np.random.normal(0, 1, [10**k, 10**k])
            W_in = np.random.normal(0, 1, [10**k, 10**k])
            y_conv = RBFEuclidean(x, W, 2**n)
            grad = tf.gradients(y_conv, [x, W])
            start = time.time()
            tf.get_default_session().run(grad, feed_dict={x: x_in, W: W_in})
            elapsed_time = time.time() - start
            results[n, k-1] += elapsed_time / repeat_i_times
            print("\nBackprop took {} with {} threads and {} datapoints".format(elapsed_time, 2**n, 10**k))
In [5]:
%pylab inline
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
from matplotlib import pyplot as plt
plt.figure()
plt.suptitle("Backpropagation", fontsize=24, y=1.05)
plt.subplot(2,2,1)
plt.title("Timing With Ten by Ten Sized Matrices")
plt.plot(results[:-1,0])
plt.scatter([5],results[-1:,0])
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Seconds to compute one calculation")
plt.subplot(2,2,2)
plt.title("Timing With a Hundred by Hundred Sized Matrices")
plt.plot(results[:-1,1])
plt.scatter([5],results[-1:,1])
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Seconds to compute one calculation")
plt.subplot(2,2,3)
plt.title("Timing With a Thousand by a Thousand Sized Matrices")
plt.plot(results[:-1,2])
plt.scatter([5],results[-1:,2])
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Seconds to compute one calculation")
plt.subplot(2,2,4)
plt.title("All Timings In Log Scale")
plt.plot(results[:-1])
plt.scatter([5],results[-1:,0],color="blue")
plt.scatter([5],results[-1:,1], color="green")
plt.scatter([5],results[-1:,2], color="red")
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Seconds to compute one calculation")
plt.yscale("log")
plt.tight_layout()
plt.savefig("backprop.pgf")
plt.show()
np.save("backproptimings.npy",results)
In [108]:
thread_counts = np.array([1,2,4,8,16,32*256])
timings = np.load("feedforwardtimings.npy")
results = timings[0,None]/timings
plt.figure()
plt.suptitle("Feedforward", fontsize=24, y=1.05)
plt.subplot(2,2,1)
plt.title("All Speed Ups")
plt.plot(results[:-1])
plt.scatter([5],results[-1:,0],color="blue")
plt.scatter([5],results[-1:,1], color="green")
plt.scatter([5],results[-1:,2], color="red")
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Speed up ratio on one calculation (log scale)")
plt.yscale("log")
plt.legend(["10x10","100x100","1000x1000"],loc=2)
plt.subplot(2,2,2)
plt.title("CPU Only Speed Ups")
plt.plot(results[:-1]-1)
plt.xticks(range(-1,6),["","1", "2", "4", "8", "16", ""])
plt.xlabel("Number of threads")
plt.ylabel("Relative speed difference on one calculation")
plt.legend(["10x10","100x100","1000x1000"],loc=2)
plt.subplot(2,2,3)
plt.title("Speed Up Per Thread")
plt.plot((results[1:-1]-1)/thread_counts[1:-1,None])
plt.scatter([4],(results[-1:,0]-1)/thread_counts[-1],color="blue")
plt.scatter([4],(results[-1:,1]-1)/thread_counts[-1], color="green")
plt.scatter([4],(results[-1:,2]-1)/thread_counts[-1], color="red")
plt.xticks(range(-1,6),["", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Relative speed difference on one calculation")
plt.legend(["10x10","100x100","1000x1000"],loc=1)
plt.subplot(2,2,4)
def amdahlPortion(speedup, threads):
    return threads * (speedup - 1) / ((threads - 1) * speedup)
plt.title("Amdahl's Law Calculated Parallelizable Portion")
plt.plot(amdahlPortion(results[1:-1],thread_counts[1:-1,None]))
plt.scatter([4],amdahlPortion(results[-1:,0],thread_counts[-1]),color="blue")
plt.scatter([4],amdahlPortion(results[-1:,1],thread_counts[-1]), color="green")
plt.scatter([4],amdahlPortion(results[-1:,2],thread_counts[-1]), color="red")
plt.xticks(range(-1,6),["", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Ratio of parallelizable code to total code")
plt.legend(["10x10","100x100","1000x1000"],loc=10)
plt.tight_layout()
plt.savefig("feedforward2.pgf")
plt.show()
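(For reference, the amdahlPortion helper above inverts Amdahl's law: with a parallelizable fraction p of the work and N threads, the predicted speedup is)

    S(N) = 1 / ((1 - p) + p / N)

(and solving for p gives)

    1 - 1/S = p * (1 - 1/N)   =>   p = N * (S - 1) / (S * (N - 1))

(which is exactly threads*(speedup-1)/((threads-1)*speedup). For the GPU point the effective thread count is taken to be 32*256 = 8192, as in thread_counts.)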
In [91]:
amdahlPortion(results[1:-2,1:],thread_counts[1:-2,None])
Out[91]:
In [92]:
amdahlPortion(results[1:-2,1],thread_counts[1:-2])
Out[92]:
In [93]:
amdahlPortion(results[1:-2,2],thread_counts[1:-2])
Out[93]:
In [94]:
amdahlPortion(results[-1,1:],thread_counts[-1])
Out[94]:
In [95]:
np.average(amdahlPortion(results[1:-2,1:],thread_counts[1:-2,None]))
Out[95]:
In [96]:
np.average(amdahlPortion(results[1:-2,1],thread_counts[1:-2]))
Out[96]:
In [97]:
np.average(amdahlPortion(results[1:-2,2],thread_counts[1:-2]))
Out[97]:
In [98]:
np.average(amdahlPortion(results[-1,1:],thread_counts[-1]))
Out[98]:
In [109]:
thread_counts = np.array([1,2,4,8,16,32*256])
timings = np.load("backproptimings.npy")
results = timings[0,None]/timings
plt.figure()
plt.suptitle("Backpropagation", fontsize=24, y=1.05)
plt.subplot(2,2,1)
plt.title("All Speed Ups")
plt.plot(results[:-1])
plt.scatter([5],results[-1:,0],color="blue")
plt.scatter([5],results[-1:,1], color="green")
plt.scatter([5],results[-1:,2], color="red")
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Speed up ratio on one calculation (log scale)")
plt.yscale("log")
plt.legend(["10x10","100x100","1000x1000"],loc=2)
plt.subplot(2,2,2)
plt.title("CPU Only Speed Ups")
plt.plot(results[:-1]-1)
plt.xticks(range(-1,6),["","1", "2", "4", "8", "16", ""])
plt.xlabel("Number of threads")
plt.ylabel("Relative speed difference on one calculation")
plt.legend(["10x10","100x100","1000x1000"],loc=5)
plt.subplot(2,2,3)
plt.title("Speed Up Per Thread")
plt.plot((results[1:-1]-1)/thread_counts[1:-1,None])
plt.scatter([4],(results[-1:,0]-1)/thread_counts[-1],color="blue")
plt.scatter([4],(results[-1:,1]-1)/thread_counts[-1], color="green")
plt.scatter([4],(results[-1:,2]-1)/thread_counts[-1], color="red")
plt.xticks(range(-1,6),["", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Relative speed difference on one calculation")
plt.legend(["10x10","100x100","1000x1000"],loc=1)
plt.subplot(2,2,4)
def amdahlPortion(speedup, threads):
    return threads * (speedup - 1) / ((threads - 1) * speedup)
plt.title("Amdahl's Law Calculated Parallelizable Portion")
plt.plot(amdahlPortion(results[1:-1],thread_counts[1:-1,None]))
plt.scatter([4],amdahlPortion(results[-1:,0],thread_counts[-1]),color="blue")
plt.scatter([4],amdahlPortion(results[-1:,1],thread_counts[-1]), color="green")
plt.scatter([4],amdahlPortion(results[-1:,2],thread_counts[-1]), color="red")
plt.xticks(range(-1,6),["", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Ratio of parallelizable code to total code")
plt.legend(["10x10","100x100","1000x1000"],loc=2)
plt.tight_layout()
plt.savefig("backprop2.pgf")
plt.show()
In [100]:
amdahlPortion(results[1:-2,1:],thread_counts[1:-2,None])
Out[100]:
In [101]:
amdahlPortion(results[1:-2,1],thread_counts[1:-2])
Out[101]:
In [102]:
amdahlPortion(results[1:-2,2],thread_counts[1:-2])
Out[102]:
In [103]:
amdahlPortion(results[-1,1:],thread_counts[-1])
Out[103]:
In [104]:
np.average(amdahlPortion(results[1:-2,1:],thread_counts[1:-2,None]))
Out[104]:
In [105]:
np.average(amdahlPortion(results[1:-2,1],thread_counts[1:-2]))
Out[105]:
In [106]:
np.average(amdahlPortion(results[1:-2,2],thread_counts[1:-2]))
Out[106]:
In [107]:
np.average(amdahlPortion(results[-1,1:],thread_counts[-1]))
Out[107]: