In [1]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

import tensorflow as tf
euclidean_dist_module = tf.load_op_library("euclidean_dist.so")
euclidean_dist = euclidean_dist_module.euclidean_dist
euclidean_dist_grad = euclidean_dist_module.euclidean_dist_grad

from tensorflow.python.framework import ops
@ops.RegisterGradient("EuclideanDist")
def _EuclideanDistGrad(op, grad):
    a = op.inputs[0]
    b = op.inputs[1]
    y = op.outputs[0]  # forward-pass distances, reused by the gradient kernel
    xGrad, cGrad = euclidean_dist_grad(a,b,y,grad)
    return xGrad, cGrad

import time


def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def RBFEuclidean(x, C, number_of_threads=1):
    """Returns the negative Euclidean distance from each row of x to each
    cluster center stored as a column of C.

    number_of_threads sets how many CPU threads the custom op uses.
    """
    return -euclidean_dist(x, C, number_of_threads)


Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
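
The compiled kernel itself is not shown in this notebook, so here is a minimal cross-check sketch (not part of the run above), assuming the op returns the plain, non-squared Euclidean distance between each row of data and each column of clusters, which is what the shapes used below imply. The same comparison can be repeated on tf.gradients of both expressions to exercise the registered gradient.

import numpy as np

def euclidean_dist_reference(data, clusters):
    # ||row_i(data) - col_j(clusters)||_2 for every (i, j) pair, via broadcasting.
    diff = tf.expand_dims(data, 2) - tf.expand_dims(clusters, 0)  # [batch, indim, outdim]
    return tf.sqrt(tf.reduce_sum(tf.square(diff), 1))             # [batch, outdim]

x_chk = tf.constant(np.random.rand(5, 4), dtype=tf.float32)
C_chk = tf.constant(np.random.rand(4, 3), dtype=tf.float32)
with tf.Session() as check_sess:
    custom, reference = check_sess.run([euclidean_dist(x_chk, C_chk, 1),
                                        euclidean_dist_reference(x_chk, C_chk)])
    print(np.allclose(custom, reference, atol=1e-5))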

In [5]:
import inspect
print(inspect.getargspec(euclidean_dist))


ArgSpec(args=['data', 'clusters', 'number_of_threads', 'name'], varargs=None, keywords=None, defaults=(None, None))
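
For reference, a direct call matching that signature looks like the hypothetical sketch below; the shapes are made up, and the expected output shape follows from how RBFEuclidean is used later.

import numpy as np
data = tf.placeholder(tf.float32, shape=[None, 4])
clusters = tf.placeholder(tf.float32, shape=[4, 3])     # one center per column
dist = euclidean_dist(data, clusters, 2, name="dist")   # third argument: number_of_threads
with tf.Session() as s:
    out = s.run(dist, {data: np.random.rand(5, 4), clusters: np.random.rand(4, 3)})
    print(out.shape)  # expected: (5, 3), one distance per (row, center) pair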

In [8]:
sess = tf.InteractiveSession()

x = tf.placeholder(tf.float32, shape=[None, 784])
y_ = tf.placeholder(tf.float32, shape=[None, 10])

W_fc1 = weight_variable([784, 1000])
W_fc2 = weight_variable([1000, 10])
b_fc2 = bias_variable([10])

h_rbf = RBFEuclidean(x, W_fc1,1)

y_conv = RBFEuclidean(h_rbf, W_fc2,1) + b_fc2

cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y_conv, y_))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
sess.run(tf.initialize_all_variables())
start = time.time()
for i in range(20000):
  batch = mnist.train.next_batch(50)
  if i%100 == 0:
    train_accuracy = accuracy.eval(feed_dict={
        x:batch[0], y_: batch[1]})
    print("\rstep %d, training accuracy %g"%(i, train_accuracy), end="" if i%1000 else "\n")
  train_step.run(feed_dict={x: batch[0], y_: batch[1]})

elapsed = time.time()-start
print("\nTraining took {}".format(elapsed))
print("\ntest accuracy %g"%accuracy.eval(feed_dict={
    x: mnist.test.images, y_: mnist.test.labels}))


step 0, training accuracy 0.18
step 1000, training accuracy 0.16
step 1200, training accuracy 0.06
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-8-318c4e6dd2cd> in <module>()
     24         x:batch[0], y_: batch[1]})
     25     print("\rstep %d, training accuracy %g"%(i, train_accuracy), end="" if i%1000 else "\n")
---> 26   train_step.run(feed_dict={x: batch[0], y_: batch[1]})
     27 
     28 elapsed = time.time()-start

/usr/local/lib/python3.4/dist-packages/tensorflow/python/framework/ops.py in run(self, feed_dict, session)
   1617         none, the default session will be used.
   1618     """
-> 1619     _run_using_default_session(self, feed_dict, self.graph, session)
   1620 
   1621 

/usr/local/lib/python3.4/dist-packages/tensorflow/python/framework/ops.py in _run_using_default_session(operation, feed_dict, graph, session)
   3794                        "the operation's graph is different from the session's "
   3795                        "graph.")
-> 3796   session.run(operation, feed_dict)
   3797 
   3798 

/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
    715     try:
    716       result = self._run(None, fetches, feed_dict, options_ptr,
--> 717                          run_metadata_ptr)
    718       if run_metadata:
    719         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
    913     if final_fetches or final_targets:
    914       results = self._do_run(handle, final_targets, final_fetches,
--> 915                              feed_dict_string, options, run_metadata)
    916     else:
    917       results = []

/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
    963     if handle is None:
    964       return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
--> 965                            target_list, options, run_metadata)
    966     else:
    967       return self._do_call(_prun_fn, self._session, handle, feed_dict,

/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
    970   def _do_call(self, fn, *args):
    971     try:
--> 972       return fn(*args)
    973     except errors.OpError as e:
    974       message = compat.as_text(e.message)

/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
    952         return tf_session.TF_Run(session, options,
    953                                  feed_dict, fetch_list, target_list,
--> 954                                  status, run_metadata)
    955 
    956     def _prun_fn(session, handle, feed_dict, fetch_list):

KeyboardInterrupt: 

In [1]:
import tensorflow as tf
euclidean_dist_module = tf.load_op_library("euclidean_dist.so")
euclidean_dist = euclidean_dist_module.euclidean_dist
euclidean_dist_grad = euclidean_dist_module.euclidean_dist_grad

from tensorflow.python.framework import ops
@ops.RegisterGradient("EuclideanDist")
def _EuclideanDistGrad(op, grad):
    a = op.inputs[0]
    b = op.inputs[1]
    y = op.outputs[0]  # forward-pass distances, reused by the gradient kernel
    xGrad, cGrad = euclidean_dist_grad(a,b,y,grad)
    return xGrad, cGrad

import time


def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def RBFEuclidean(x, C, number_of_threads=1):
    """Returns the negative Euclidean distance from each row of x to each
    cluster center stored as a column of C.

    number_of_threads sets how many CPU threads the custom op uses.
    """
    return -euclidean_dist(x, C, number_of_threads)

In [2]:
repeat_i_times = 10

In [3]:
import os
import numpy as np
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # run this benchmark on the GPU

sess = tf.InteractiveSession()

results = np.zeros([6,3])
for i in range(repeat_i_times):
    for k in range(1,4):
        x = tf.placeholder(tf.float32, shape=[None, 10**k])
        y_ = tf.placeholder(tf.float32, shape=[None, 10**k])

        W = weight_variable([10**k, 10**k])

        x_in = np.random.normal(0,1,[10**k,10**k])
        W_in = np.random.normal(0,1,[10**k,10**k])

        y_conv = RBFEuclidean(x, W)
        start = time.time()
        y_conv.eval({x: x_in, W: W_in})
        elapsed_time = time.time() - start
        results[5, k-1] += elapsed_time / repeat_i_times
        print("\nFeed-forward took {} with GPU and {} datapoints".format(elapsed_time, 10**k))


Feed-forward took 0.09167695045471191 with GPU and 10 datapoints

Feed-forward took 0.00390625 with GPU and 100 datapoints

Feed-forward took 0.2682304382324219 with GPU and 1000 datapoints

Feed-forward took 0.004080772399902344 with GPU and 10 datapoints

Feed-forward took 0.003939628601074219 with GPU and 100 datapoints

Feed-forward took 0.2681541442871094 with GPU and 1000 datapoints

Feed-forward took 0.0038886070251464844 with GPU and 10 datapoints

Feed-forward took 0.004070758819580078 with GPU and 100 datapoints

Feed-forward took 0.2671811580657959 with GPU and 1000 datapoints

Feed-forward took 0.004770040512084961 with GPU and 10 datapoints

Feed-forward took 0.0046041011810302734 with GPU and 100 datapoints

Feed-forward took 0.2686278820037842 with GPU and 1000 datapoints

Feed-forward took 0.004425764083862305 with GPU and 10 datapoints

Feed-forward took 0.004774570465087891 with GPU and 100 datapoints

Feed-forward took 0.2672851085662842 with GPU and 1000 datapoints

Feed-forward took 0.005418539047241211 with GPU and 10 datapoints

Feed-forward took 0.0050122737884521484 with GPU and 100 datapoints

Feed-forward took 0.26698946952819824 with GPU and 1000 datapoints

Feed-forward took 0.004907846450805664 with GPU and 10 datapoints

Feed-forward took 0.0053844451904296875 with GPU and 100 datapoints

Feed-forward took 0.2674129009246826 with GPU and 1000 datapoints

Feed-forward took 0.00696110725402832 with GPU and 10 datapoints

Feed-forward took 0.00733184814453125 with GPU and 100 datapoints

Feed-forward took 0.2689547538757324 with GPU and 1000 datapoints

Feed-forward took 0.006661891937255859 with GPU and 10 datapoints

Feed-forward took 0.00678253173828125 with GPU and 100 datapoints

Feed-forward took 0.2684810161590576 with GPU and 1000 datapoints

Feed-forward took 0.007348060607910156 with GPU and 10 datapoints

Feed-forward took 0.006922483444213867 with GPU and 100 datapoints

Feed-forward took 0.26943349838256836 with GPU and 1000 datapoints
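
The very first GPU measurement above (~0.09 s for the 10x10 case) is roughly twenty times the later ones, which suggests it absorbs one-time setup (graph pruning, kernel launch, GPU memory allocation) rather than steady-state compute. A sketch of a warm-up-then-average timer that would factor that out (not what the runs above used):

def time_eval(tensor, feed, repeats=10):
    tensor.eval(feed)                 # warm-up run, discarded
    start = time.time()
    for _ in range(repeats):
        tensor.eval(feed)
    return (time.time() - start) / repeats

# e.g. elapsed_time = time_eval(y_conv, {x: x_in, W: W_in})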

In [4]:
np.save("timings.npy",results)

In [3]:
results = np.load("timings.npy")

In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]=""

import numpy as np

sess = tf.InteractiveSession()
for i in range(repeat_i_times):
    for n in range(0,5):
        for k in range(1,4):
            x = tf.placeholder(tf.float32, shape=[None, 10**k])
            y_ = tf.placeholder(tf.float32, shape=[None, 10**k])

            W = weight_variable([10**k, 10**k])

            x_in = np.random.normal(0,1,[10**k,10**k])
            W_in = np.random.normal(0,1,[10**k,10**k])

            y_conv = RBFEuclidean(x, W, 2**n)
            start = time.time()
            y_conv.eval({x: x_in, W: W_in})
            elapsed_time = time.time() - start
            results[n, k-1] += elapsed_time / repeat_i_times
            print("\nFeed-forward took {} with {} threads and {} datapoints".format(elapsed_time, 2**n, 10**k))


Feed-forward took 0.0032329559326171875 with 1 threads and 10 datapoints

Feed-forward took 0.08902144432067871 with 1 threads and 100 datapoints

Feed-forward took 88.15529537200928 with 1 threads and 1000 datapoints

Feed-forward took 0.0038673877716064453 with 2 threads and 10 datapoints

Feed-forward took 0.0462346076965332 with 2 threads and 100 datapoints

Feed-forward took 46.80621886253357 with 2 threads and 1000 datapoints

Feed-forward took 0.0035529136657714844 with 4 threads and 10 datapoints

Feed-forward took 0.02516341209411621 with 4 threads and 100 datapoints

Feed-forward took 25.509135246276855 with 4 threads and 1000 datapoints

Feed-forward took 0.004381418228149414 with 8 threads and 10 datapoints

Feed-forward took 0.02704787254333496 with 8 threads and 100 datapoints

Feed-forward took 17.36546564102173 with 8 threads and 1000 datapoints

Feed-forward took 0.004401445388793945 with 16 threads and 10 datapoints

Feed-forward took 0.030051231384277344 with 16 threads and 100 datapoints

Feed-forward took 17.46496081352234 with 16 threads and 1000 datapoints

Feed-forward took 0.004831075668334961 with 1 threads and 10 datapoints

Feed-forward took 0.09022903442382812 with 1 threads and 100 datapoints

Feed-forward took 88.48209857940674 with 1 threads and 1000 datapoints

Feed-forward took 0.00477147102355957 with 2 threads and 10 datapoints

Feed-forward took 0.04735517501831055 with 2 threads and 100 datapoints

Feed-forward took 44.32245683670044 with 2 threads and 1000 datapoints

Feed-forward took 0.00843954086303711 with 4 threads and 10 datapoints

Feed-forward took 0.04159379005432129 with 4 threads and 100 datapoints

Feed-forward took 26.77450180053711 with 4 threads and 1000 datapoints

Feed-forward took 0.005580425262451172 with 8 threads and 10 datapoints

Feed-forward took 0.030315876007080078 with 8 threads and 100 datapoints

Feed-forward took 17.26765489578247 with 8 threads and 1000 datapoints

Feed-forward took 0.0065190792083740234 with 16 threads and 10 datapoints

Feed-forward took 0.029532432556152344 with 16 threads and 100 datapoints

Feed-forward took 17.398097038269043 with 16 threads and 1000 datapoints

Feed-forward took 0.005860090255737305 with 1 threads and 10 datapoints

Feed-forward took 0.09119391441345215 with 1 threads and 100 datapoints

Feed-forward took 87.64925146102905 with 1 threads and 1000 datapoints

Feed-forward took 0.0063323974609375 with 2 threads and 10 datapoints

Feed-forward took 0.048928260803222656 with 2 threads and 100 datapoints

Feed-forward took 44.56292247772217 with 2 threads and 1000 datapoints

Feed-forward took 0.006670236587524414 with 4 threads and 10 datapoints

Feed-forward took 0.03917646408081055 with 4 threads and 100 datapoints

Feed-forward took 27.926311016082764 with 4 threads and 1000 datapoints

Feed-forward took 0.007539272308349609 with 8 threads and 10 datapoints

Feed-forward took 0.0291900634765625 with 8 threads and 100 datapoints

Feed-forward took 17.684712648391724 with 8 threads and 1000 datapoints

Feed-forward took 0.007737159729003906 with 16 threads and 10 datapoints

Feed-forward took 0.02843499183654785 with 16 threads and 100 datapoints

Feed-forward took 17.50123953819275 with 16 threads and 1000 datapoints

Feed-forward took 0.007530927658081055 with 1 threads and 10 datapoints

Feed-forward took 0.10113716125488281 with 1 threads and 100 datapoints

Feed-forward took 90.45476269721985 with 1 threads and 1000 datapoints

Feed-forward took 0.007920980453491211 with 2 threads and 10 datapoints

Feed-forward took 0.05043148994445801 with 2 threads and 100 datapoints

Feed-forward took 45.97048759460449 with 2 threads and 1000 datapoints

Feed-forward took 0.00815439224243164 with 4 threads and 10 datapoints

Feed-forward took 0.02928471565246582 with 4 threads and 100 datapoints

Feed-forward took 23.64580488204956 with 4 threads and 1000 datapoints

Feed-forward took 0.008615970611572266 with 8 threads and 10 datapoints

Feed-forward took 0.02761697769165039 with 8 threads and 100 datapoints

Feed-forward took 17.635887145996094 with 8 threads and 1000 datapoints

Feed-forward took 0.009515762329101562 with 16 threads and 10 datapoints

Feed-forward took 0.03255105018615723 with 16 threads and 100 datapoints

Feed-forward took 17.444978952407837 with 16 threads and 1000 datapoints

Feed-forward took 0.009038448333740234 with 1 threads and 10 datapoints

Feed-forward took 0.09419751167297363 with 1 threads and 100 datapoints

Feed-forward took 88.99506640434265 with 1 threads and 1000 datapoints

Feed-forward took 0.009431123733520508 with 2 threads and 10 datapoints

Feed-forward took 0.05185437202453613 with 2 threads and 100 datapoints

Feed-forward took 43.805904150009155 with 2 threads and 1000 datapoints

Feed-forward took 0.009900569915771484 with 4 threads and 10 datapoints

Feed-forward took 0.03080129623413086 with 4 threads and 100 datapoints

Feed-forward took 29.398962259292603 with 4 threads and 1000 datapoints

Feed-forward took 0.010571718215942383 with 8 threads and 10 datapoints

Feed-forward took 0.033154964447021484 with 8 threads and 100 datapoints

Feed-forward took 17.367835998535156 with 8 threads and 1000 datapoints

Feed-forward took 0.010890007019042969 with 16 threads and 10 datapoints

Feed-forward took 0.03492546081542969 with 16 threads and 100 datapoints

Feed-forward took 17.79841685295105 with 16 threads and 1000 datapoints

Feed-forward took 0.010779380798339844 with 1 threads and 10 datapoints

Feed-forward took 0.09603428840637207 with 1 threads and 100 datapoints

Feed-forward took 87.34103488922119 with 1 threads and 1000 datapoints

Feed-forward took 0.011073112487792969 with 2 threads and 10 datapoints

Feed-forward took 0.053179264068603516 with 2 threads and 100 datapoints

Feed-forward took 44.32625699043274 with 2 threads and 1000 datapoints

Feed-forward took 0.011386632919311523 with 4 threads and 10 datapoints

Feed-forward took 0.03217887878417969 with 4 threads and 100 datapoints

Feed-forward took 25.566179275512695 with 4 threads and 1000 datapoints

Feed-forward took 0.011873006820678711 with 8 threads and 10 datapoints

Feed-forward took 0.030729293823242188 with 8 threads and 100 datapoints

Feed-forward took 17.558427095413208 with 8 threads and 1000 datapoints

Feed-forward took 0.012842893600463867 with 16 threads and 10 datapoints

Feed-forward took 0.03357887268066406 with 16 threads and 100 datapoints

Feed-forward took 17.48691177368164 with 16 threads and 1000 datapoints

Feed-forward took 0.012748003005981445 with 1 threads and 10 datapoints

Feed-forward took 0.09749650955200195 with 1 threads and 100 datapoints

Feed-forward took 92.39177441596985 with 1 threads and 1000 datapoints

Feed-forward took 0.012720108032226562 with 2 threads and 10 datapoints

Feed-forward took 0.05495858192443848 with 2 threads and 100 datapoints

Feed-forward took 44.13042688369751 with 2 threads and 1000 datapoints

Feed-forward took 0.013065099716186523 with 4 threads and 10 datapoints

Feed-forward took 0.04522275924682617 with 4 threads and 100 datapoints

Feed-forward took 27.389307022094727 with 4 threads and 1000 datapoints

Feed-forward took 0.013686418533325195 with 8 threads and 10 datapoints

Feed-forward took 0.032560110092163086 with 8 threads and 100 datapoints

Feed-forward took 17.700119256973267 with 8 threads and 1000 datapoints

Feed-forward took 0.012939929962158203 with 16 threads and 10 datapoints

Feed-forward took 0.03823733329772949 with 16 threads and 100 datapoints

Feed-forward took 17.4762065410614 with 16 threads and 1000 datapoints

Feed-forward took 0.012934684753417969 with 1 threads and 10 datapoints

Feed-forward took 0.13637399673461914 with 1 threads and 100 datapoints

Feed-forward took 91.13917708396912 with 1 threads and 1000 datapoints

Feed-forward took 0.014436006546020508 with 2 threads and 10 datapoints

Feed-forward took 0.05678200721740723 with 2 threads and 100 datapoints

Feed-forward took 43.62279033660889 with 2 threads and 1000 datapoints

Feed-forward took 0.014259099960327148 with 4 threads and 10 datapoints

Feed-forward took 0.03540492057800293 with 4 threads and 100 datapoints

Feed-forward took 28.423216104507446 with 4 threads and 1000 datapoints

Feed-forward took 0.015329360961914062 with 8 threads and 10 datapoints

Feed-forward took 0.03820395469665527 with 8 threads and 100 datapoints

Feed-forward took 18.106050968170166 with 8 threads and 1000 datapoints

Feed-forward took 0.01594066619873047 with 16 threads and 10 datapoints

Feed-forward took 0.03790640830993652 with 16 threads and 100 datapoints

Feed-forward took 17.475857973098755 with 16 threads and 1000 datapoints

Feed-forward took 0.014488458633422852 with 1 threads and 10 datapoints

Feed-forward took 0.10051918029785156 with 1 threads and 100 datapoints

Feed-forward took 87.23844242095947 with 1 threads and 1000 datapoints

Feed-forward took 0.016185522079467773 with 2 threads and 10 datapoints

Feed-forward took 0.05907082557678223 with 2 threads and 100 datapoints

Feed-forward took 43.58178663253784 with 2 threads and 1000 datapoints

Feed-forward took 0.016939640045166016 with 4 threads and 10 datapoints

Feed-forward took 0.0496068000793457 with 4 threads and 100 datapoints

Feed-forward took 31.100414991378784 with 4 threads and 1000 datapoints

Feed-forward took 0.0175018310546875 with 8 threads and 10 datapoints

Feed-forward took 0.03852081298828125 with 8 threads and 100 datapoints

Feed-forward took 17.533580780029297 with 8 threads and 1000 datapoints

Feed-forward took 0.017953872680664062 with 16 threads and 10 datapoints

Feed-forward took 0.04227566719055176 with 16 threads and 100 datapoints

Feed-forward took 17.470565795898438 with 16 threads and 1000 datapoints

Feed-forward took 0.016302108764648438 with 1 threads and 10 datapoints

Feed-forward took 0.1025083065032959 with 1 threads and 100 datapoints

Feed-forward took 88.26718306541443 with 1 threads and 1000 datapoints

Feed-forward took 0.01723647117614746 with 2 threads and 10 datapoints

Feed-forward took 0.05930066108703613 with 2 threads and 100 datapoints

Feed-forward took 43.64161229133606 with 2 threads and 1000 datapoints

Feed-forward took 0.018032550811767578 with 4 threads and 10 datapoints

Feed-forward took 0.038730621337890625 with 4 threads and 100 datapoints

Feed-forward took 25.92401099205017 with 4 threads and 1000 datapoints

Feed-forward took 0.018239259719848633 with 8 threads and 10 datapoints

Feed-forward took 0.03597688674926758 with 8 threads and 100 datapoints

Feed-forward took 17.446740865707397 with 8 threads and 1000 datapoints

Feed-forward took 0.02016282081604004 with 16 threads and 10 datapoints

Feed-forward took 0.04096698760986328 with 16 threads and 100 datapoints

Feed-forward took 17.4446542263031 with 16 threads and 1000 datapoints

In [33]:
%pylab inline
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
from matplotlib import pyplot as plt

plt.figure()
plt.suptitle("Feedforward", fontsize=24, y=1.05)
plt.subplot(2,2,1)
plt.title("Timing With Ten by Ten Sized Matrices")
plt.plot(results[:-1,0])
plt.scatter([5],results[-1:,0])
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Seconds to compute one calculation")

plt.subplot(2,2,2)
plt.title("Timing With a Hundred by Hundred Sized Matrices")
plt.plot(results[:-1,1])
plt.scatter([5],results[-1:,1])
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Seconds to compute one calculation")

plt.subplot(2,2,3)
plt.title("Timing With a Thousand by a Thousand Sized Matrices")
plt.plot(results[:-1,2])
plt.scatter([5],results[-1:,2])
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Seconds to compute one calculation")

plt.subplot(2,2,4)
plt.title("All Timings In Log Scale")
plt.plot(results[:-1])
plt.scatter([5],results[-1:,0],color="blue")
plt.scatter([5],results[-1:,1], color="green")
plt.scatter([5],results[-1:,2], color="red")
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Seconds to compute one calculation")

plt.yscale("log")
plt.tight_layout()

plt.savefig("feedforward.pgf")
plt.show()
np.save("feedforwardtimings.npy",results)


Populating the interactive namespace from numpy and matplotlib

In [1]:
import tensorflow as tf
euclidean_dist_module = tf.load_op_library("euclidean_dist.so")
euclidean_dist = euclidean_dist_module.euclidean_dist
euclidean_dist_grad = euclidean_dist_module.euclidean_dist_grad

from tensorflow.python.framework import ops
@ops.RegisterGradient("EuclideanDist")
def _EuclideanDistGrad(op, grad):
    a = op.inputs[0]
    b = op.inputs[1]
    y = op.outputs[0]  # forward-pass distances, reused by the gradient kernel
    xGrad, cGrad = euclidean_dist_grad(a,b,y,grad)
    return xGrad, cGrad

import time


def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def RBFEuclidean(x, C, number_of_threads=1):
    """Returns the negative Euclidean distance from each row of x to each
    cluster center stored as a column of C.

    number_of_threads sets how many CPU threads the custom op uses.
    """
    return -euclidean_dist(x, C, number_of_threads)

In [2]:
repeat_i_times = 10

In [4]:
import os
import numpy as np
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # run this benchmark on the GPU

sess = tf.InteractiveSession()

results = np.zeros([6,3])
for i in range(repeat_i_times):
    for k in range(1,4):
        x = tf.placeholder(tf.float32, shape=[None, 10**k])
        y_ = tf.placeholder(tf.float32, shape=[None, 10**k])

        W = weight_variable([10**k, 10**k])

        x_in = np.random.normal(0,1,[10**k,10**k])
        W_in = np.random.normal(0,1,[10**k,10**k])

        y_conv = RBFEuclidean(x, W)
        
        grad = tf.gradients(y_conv,[x,W])
        start = time.time()
        tf.get_default_session().run(grad, feed_dict={x: x_in, W: W_in})
        elapsed_time = time.time() - start
        results[5, k-1] += elapsed_time / repeat_i_times
        print("\nBackprop took {} with GPU and {} datapoints".format(elapsed_time, 10**k))


Backprop took 0.10253262519836426 with GPU and 10 datapoints

Backprop took 0.0055501461029052734 with GPU and 100 datapoints

Backprop took 0.45310139656066895 with GPU and 1000 datapoints

Backprop took 0.004923820495605469 with GPU and 10 datapoints

Backprop took 0.005915403366088867 with GPU and 100 datapoints

Backprop took 0.4490547180175781 with GPU and 1000 datapoints

Backprop took 0.005160808563232422 with GPU and 10 datapoints

Backprop took 0.0061187744140625 with GPU and 100 datapoints

Backprop took 0.4521031379699707 with GPU and 1000 datapoints

Backprop took 0.0056247711181640625 with GPU and 10 datapoints

Backprop took 0.00650477409362793 with GPU and 100 datapoints

Backprop took 0.4496266841888428 with GPU and 1000 datapoints

Backprop took 0.006127357482910156 with GPU and 10 datapoints

Backprop took 0.00709223747253418 with GPU and 100 datapoints

Backprop took 0.45067405700683594 with GPU and 1000 datapoints

Backprop took 0.0064504146575927734 with GPU and 10 datapoints

Backprop took 0.007270097732543945 with GPU and 100 datapoints

Backprop took 0.4512162208557129 with GPU and 1000 datapoints

Backprop took 0.00681614875793457 with GPU and 10 datapoints

Backprop took 0.007638216018676758 with GPU and 100 datapoints

Backprop took 0.45655226707458496 with GPU and 1000 datapoints

Backprop took 0.010160684585571289 with GPU and 10 datapoints

Backprop took 0.011613607406616211 with GPU and 100 datapoints

Backprop took 0.45688962936401367 with GPU and 1000 datapoints

Backprop took 0.014014005661010742 with GPU and 10 datapoints

Backprop took 0.008870601654052734 with GPU and 100 datapoints

Backprop took 0.4524211883544922 with GPU and 1000 datapoints

Backprop took 0.008249759674072266 with GPU and 10 datapoints

Backprop took 0.008954048156738281 with GPU and 100 datapoints

Backprop took 0.4534881114959717 with GPU and 1000 datapoints
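
One caveat about both backprop benchmarks: every iteration adds new placeholders, a new weight Variable, and a new tf.gradients subgraph to the same default graph, so the graph keeps growing over the run. A sketch (not what the timings above used) of isolating each measurement in its own graph and session, reusing the notebook's RBFEuclidean, np, and time:

def timed_backprop(k, threads):
    tf.reset_default_graph()
    with tf.Session() as s:
        x = tf.placeholder(tf.float32, shape=[None, 10**k])
        W = tf.placeholder(tf.float32, shape=[10**k, 10**k])   # fed directly, so a placeholder suffices
        grad = tf.gradients(RBFEuclidean(x, W, threads), [x, W])
        x_in = np.random.normal(0, 1, [10**k, 10**k])
        W_in = np.random.normal(0, 1, [10**k, 10**k])
        start = time.time()
        s.run(grad, feed_dict={x: x_in, W: W_in})
        return time.time() - start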

In [5]:
np.save("timings.npy",results)

In [3]:
results = np.load("timings.npy")

In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]=""

import numpy as np

sess = tf.InteractiveSession()
for i in range(repeat_i_times):
    for n in range(0,5):
        for k in range(1,4):
            x = tf.placeholder(tf.float32, shape=[None, 10**k])
            y_ = tf.placeholder(tf.float32, shape=[None, 10**k])

            W = weight_variable([10**k, 10**k])

            x_in = np.random.normal(0,1,[10**k,10**k])
            W_in = np.random.normal(0,1,[10**k,10**k])

            y_conv = RBFEuclidean(x, W, 2**n)
        
            grad = tf.gradients(y_conv,[x,W])
            start = time.time()
            tf.get_default_session().run(grad, feed_dict={x: x_in, W: W_in})
            elapsed_time = time.time() - start
            results[n, k-1] += elapsed_time / repeat_i_times
            print("\nBackprop took {} with {} threads and {} datapoints".format(elapsed_time, 2**n, 10**k))


Backprop took 0.004094123840332031 with 1 threads and 10 datapoints

Backprop took 0.2740468978881836 with 1 threads and 100 datapoints

Backprop took 204.35610055923462 with 1 threads and 1000 datapoints

Backprop took 0.004304409027099609 with 2 threads and 10 datapoints

Backprop took 0.15582704544067383 with 2 threads and 100 datapoints

Backprop took 159.93707871437073 with 2 threads and 1000 datapoints

Backprop took 0.004886150360107422 with 4 threads and 10 datapoints

Backprop took 0.1363816261291504 with 4 threads and 100 datapoints

Backprop took 144.11951208114624 with 4 threads and 1000 datapoints

Backprop took 0.0053904056549072266 with 8 threads and 10 datapoints

Backprop took 0.13651108741760254 with 8 threads and 100 datapoints

Backprop took 133.5270013809204 with 8 threads and 1000 datapoints

Backprop took 0.006148576736450195 with 16 threads and 10 datapoints

Backprop took 0.13579940795898438 with 16 threads and 100 datapoints

Backprop took 131.45001792907715 with 16 threads and 1000 datapoints

Backprop took 0.005733013153076172 with 1 threads and 10 datapoints

Backprop took 0.20072674751281738 with 1 threads and 100 datapoints

Backprop took 205.21843361854553 with 1 threads and 1000 datapoints

Backprop took 0.0063893795013427734 with 2 threads and 10 datapoints

Backprop took 0.15763378143310547 with 2 threads and 100 datapoints

Backprop took 162.17705726623535 with 2 threads and 1000 datapoints

Backprop took 0.006818056106567383 with 4 threads and 10 datapoints

Backprop took 0.14864802360534668 with 4 threads and 100 datapoints

Backprop took 142.08267498016357 with 4 threads and 1000 datapoints

Backprop took 0.007290840148925781 with 8 threads and 10 datapoints

Backprop took 0.13888955116271973 with 8 threads and 100 datapoints

Backprop took 134.99269032478333 with 8 threads and 1000 datapoints

Backprop took 0.008765697479248047 with 16 threads and 10 datapoints

Backprop took 0.14610743522644043 with 16 threads and 100 datapoints

Backprop took 131.87644791603088 with 16 threads and 1000 datapoints

Backprop took 0.008007526397705078 with 1 threads and 10 datapoints

Backprop took 0.2025744915008545 with 1 threads and 100 datapoints

Backprop took 205.1317493915558 with 1 threads and 1000 datapoints

Backprop took 0.009347200393676758 with 2 threads and 10 datapoints

Backprop took 0.16611552238464355 with 2 threads and 100 datapoints

Backprop took 159.69919157028198 with 2 threads and 1000 datapoints

Backprop took 0.009036779403686523 with 4 threads and 10 datapoints

Backprop took 0.1401972770690918 with 4 threads and 100 datapoints

Backprop took 142.50203156471252 with 4 threads and 1000 datapoints

Backprop took 0.03990650177001953 with 8 threads and 10 datapoints

Backprop took 0.14565110206604004 with 8 threads and 100 datapoints

Backprop took 135.12911939620972 with 8 threads and 1000 datapoints

Backprop took 0.0113677978515625 with 16 threads and 10 datapoints

Backprop took 0.1410965919494629 with 16 threads and 100 datapoints

Backprop took 131.91143107414246 with 16 threads and 1000 datapoints

Backprop took 0.010936737060546875 with 1 threads and 10 datapoints

Backprop took 0.20449495315551758 with 1 threads and 100 datapoints

Backprop took 206.40549993515015 with 1 threads and 1000 datapoints

Backprop took 0.010952472686767578 with 2 threads and 10 datapoints

Backprop took 0.1628403663635254 with 2 threads and 100 datapoints

Backprop took 159.99777913093567 with 2 threads and 1000 datapoints

Backprop took 0.011780977249145508 with 4 threads and 10 datapoints

Backprop took 0.152374267578125 with 4 threads and 100 datapoints

Backprop took 143.0022759437561 with 4 threads and 1000 datapoints

Backprop took 0.01102590560913086 with 8 threads and 10 datapoints

Backprop took 0.15091776847839355 with 8 threads and 100 datapoints

Backprop took 132.65297102928162 with 8 threads and 1000 datapoints

Backprop took 0.012123346328735352 with 16 threads and 10 datapoints

Backprop took 0.15025973320007324 with 16 threads and 100 datapoints

Backprop took 134.97525811195374 with 16 threads and 1000 datapoints

Backprop took 0.012554645538330078 with 1 threads and 10 datapoints

Backprop took 0.20638775825500488 with 1 threads and 100 datapoints

Backprop took 204.1528069972992 with 1 threads and 1000 datapoints

Backprop took 0.014051437377929688 with 2 threads and 10 datapoints

Backprop took 0.16444659233093262 with 2 threads and 100 datapoints

Backprop took 163.8159146308899 with 2 threads and 1000 datapoints

Backprop took 0.012989044189453125 with 4 threads and 10 datapoints

Backprop took 0.14309382438659668 with 4 threads and 100 datapoints

Backprop took 145.77241921424866 with 4 threads and 1000 datapoints

Backprop took 0.014562606811523438 with 8 threads and 10 datapoints

Backprop took 0.14074110984802246 with 8 threads and 100 datapoints

Backprop took 132.21886682510376 with 8 threads and 1000 datapoints

Backprop took 0.014961004257202148 with 16 threads and 10 datapoints

Backprop took 0.143324613571167 with 16 threads and 100 datapoints

Backprop took 132.91949677467346 with 16 threads and 1000 datapoints

Backprop took 0.014765501022338867 with 1 threads and 10 datapoints

Backprop took 0.22594714164733887 with 1 threads and 100 datapoints

Backprop took 204.72345209121704 with 1 threads and 1000 datapoints

Backprop took 0.014847993850708008 with 2 threads and 10 datapoints

Backprop took 0.1656789779663086 with 2 threads and 100 datapoints

Backprop took 161.70656633377075 with 2 threads and 1000 datapoints

Backprop took 0.015531778335571289 with 4 threads and 10 datapoints

Backprop took 0.15711045265197754 with 4 threads and 100 datapoints

Backprop took 147.42548036575317 with 4 threads and 1000 datapoints

Backprop took 0.015822649002075195 with 8 threads and 10 datapoints

Backprop took 0.15778470039367676 with 8 threads and 100 datapoints

Backprop took 136.97084403038025 with 8 threads and 1000 datapoints

Backprop took 0.017802000045776367 with 16 threads and 10 datapoints

Backprop took 0.14468598365783691 with 16 threads and 100 datapoints

Backprop took 136.07438802719116 with 16 threads and 1000 datapoints

Backprop took 0.01599287986755371 with 1 threads and 10 datapoints

Backprop took 0.20919084548950195 with 1 threads and 100 datapoints

Backprop took 202.39225935935974 with 1 threads and 1000 datapoints

Backprop took 0.016911029815673828 with 2 threads and 10 datapoints

Backprop took 0.16654562950134277 with 2 threads and 100 datapoints

Backprop took 160.2892985343933 with 2 threads and 1000 datapoints

Backprop took 0.015943527221679688 with 4 threads and 10 datapoints

Backprop took 0.1475200653076172 with 4 threads and 100 datapoints

Backprop took 144.0751438140869 with 4 threads and 1000 datapoints

Backprop took 0.01691436767578125 with 8 threads and 10 datapoints

Backprop took 0.14827990531921387 with 8 threads and 100 datapoints

Backprop took 133.3203661441803 with 8 threads and 1000 datapoints

Backprop took 0.01748967170715332 with 16 threads and 10 datapoints

Backprop took 0.1465156078338623 with 16 threads and 100 datapoints

Backprop took 134.18065786361694 with 16 threads and 1000 datapoints

Backprop took 0.018362998962402344 with 1 threads and 10 datapoints

Backprop took 0.21236538887023926 with 1 threads and 100 datapoints

Backprop took 202.32116174697876 with 1 threads and 1000 datapoints

Backprop took 0.01821160316467285 with 2 threads and 10 datapoints

Backprop took 0.2044358253479004 with 2 threads and 100 datapoints

Backprop took 162.02967929840088 with 2 threads and 1000 datapoints

Backprop took 0.01882314682006836 with 4 threads and 10 datapoints

Backprop took 0.1491868495941162 with 4 threads and 100 datapoints

Backprop took 144.11870908737183 with 4 threads and 1000 datapoints

Backprop took 0.020158052444458008 with 8 threads and 10 datapoints

Backprop took 0.1500082015991211 with 8 threads and 100 datapoints

Backprop took 134.4871220588684 with 8 threads and 1000 datapoints

Backprop took 0.020169973373413086 with 16 threads and 10 datapoints

Backprop took 0.15879583358764648 with 16 threads and 100 datapoints

Backprop took 132.8937394618988 with 16 threads and 1000 datapoints

Backprop took 0.01919722557067871 with 1 threads and 10 datapoints

Backprop took 0.21400690078735352 with 1 threads and 100 datapoints

Backprop took 203.63754081726074 with 1 threads and 1000 datapoints

Backprop took 0.02272176742553711 with 2 threads and 10 datapoints

Backprop took 0.17065834999084473 with 2 threads and 100 datapoints

Backprop took 160.6747760772705 with 2 threads and 1000 datapoints

Backprop took 0.02127981185913086 with 4 threads and 10 datapoints

Backprop took 0.15175485610961914 with 4 threads and 100 datapoints

Backprop took 143.90283465385437 with 4 threads and 1000 datapoints

Backprop took 0.02323436737060547 with 8 threads and 10 datapoints

Backprop took 0.15219569206237793 with 8 threads and 100 datapoints

Backprop took 134.1761496067047 with 8 threads and 1000 datapoints

Backprop took 0.03505444526672363 with 16 threads and 10 datapoints

Backprop took 0.1750776767730713 with 16 threads and 100 datapoints

Backprop took 132.98229932785034 with 16 threads and 1000 datapoints

Backprop took 0.021880626678466797 with 1 threads and 10 datapoints

Backprop took 0.21570754051208496 with 1 threads and 100 datapoints

Backprop took 204.94635438919067 with 1 threads and 1000 datapoints

Backprop took 0.024300336837768555 with 2 threads and 10 datapoints

Backprop took 0.17359495162963867 with 2 threads and 100 datapoints

Backprop took 160.61314988136292 with 2 threads and 1000 datapoints

Backprop took 0.023850202560424805 with 4 threads and 10 datapoints

Backprop took 0.16777300834655762 with 4 threads and 100 datapoints

Backprop took 146.12919998168945 with 4 threads and 1000 datapoints

Backprop took 0.025606870651245117 with 8 threads and 10 datapoints

Backprop took 0.15977978706359863 with 8 threads and 100 datapoints

Backprop took 134.21318459510803 with 8 threads and 1000 datapoints

Backprop took 0.02656078338623047 with 16 threads and 10 datapoints

Backprop took 0.15356063842773438 with 16 threads and 100 datapoints

Backprop took 134.84589433670044 with 16 threads and 1000 datapoints

In [5]:
%pylab inline
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
from matplotlib import pyplot as plt

plt.figure()
plt.suptitle("Backpropagation", fontsize=24, y=1.05)
plt.subplot(2,2,1)
plt.title("Timing With Ten by Ten Sized Matrices")
plt.plot(results[:-1,0])
plt.scatter([5],results[-1:,0])
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Seconds to compute one calculation")

plt.subplot(2,2,2)
plt.title("Timing With a Hundred by Hundred Sized Matrices")
plt.plot(results[:-1,1])
plt.scatter([5],results[-1:,1])
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Seconds to compute one calculation")

plt.subplot(2,2,3)
plt.title("Timing With a Thousand by a Thousand Sized Matrices")
plt.plot(results[:-1,2])
plt.scatter([5],results[-1:,2])
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Seconds to compute one calculation")

plt.subplot(2,2,4)
plt.title("All Timings In Log Scale")
plt.plot(results[:-1])
plt.scatter([5],results[-1:,0],color="blue")
plt.scatter([5],results[-1:,1], color="green")
plt.scatter([5],results[-1:,2], color="red")
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Seconds to compute one calculation")
plt.yscale("log")

plt.tight_layout()

plt.savefig("backprop.pgf")
plt.show()
np.save("backproptimings.npy",results)


Populating the interactive namespace from numpy and matplotlib

In [108]:
thread_counts = np.array([1,2,4,8,16,32*256])  # last entry stands in for the GPU, treated as 32*256 "threads"
timings = np.load("feedforwardtimings.npy")
results = timings[0,None]/timings
plt.figure()
plt.suptitle("Feedforward", fontsize=24, y=1.05)
plt.subplot(2,2,1)
plt.title("All Speed Ups")
plt.plot(results[:-1])
plt.scatter([5],results[-1:,0],color="blue")
plt.scatter([5],results[-1:,1], color="green")
plt.scatter([5],results[-1:,2], color="red")
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Speed up ratio on one calculation (log scale)")
plt.yscale("log")
plt.legend(["10x10","100x100","1000x1000"],loc=2)

plt.subplot(2,2,2)
plt.title("CPU Only Speed Ups")
plt.plot(results[:-1]-1)
plt.xticks(range(-1,6),["","1", "2", "4", "8", "16", ""])
plt.xlabel("Number of threads")
plt.ylabel("Relative speed difference on one calculation")
plt.legend(["10x10","100x100","1000x1000"],loc=2)

plt.subplot(2,2,3)
plt.title("Speed Up Per Thread")
plt.plot((results[1:-1]-1)/thread_counts[1:-1,None])
plt.scatter([4],(results[-1:,0]-1)/thread_counts[-1],color="blue")
plt.scatter([4],(results[-1:,1]-1)/thread_counts[-1], color="green")
plt.scatter([4],(results[-1:,2]-1)/thread_counts[-1], color="red")
plt.xticks(range(-1,6),["", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Relative speed difference on one calculation")
plt.legend(["10x10","100x100","1000x1000"],loc=1)

plt.subplot(2,2,4)
def amdahlPortion(speedup, threads):
    # Amdahl's law S = 1 / ((1-p) + p/N), solved for the parallel fraction p = N(S-1) / ((N-1)S)
    return threads*(speedup-1)/((threads-1)*speedup)
plt.title("Amdahl's Law Calculated Parallelizable Portion")
plt.plot(amdahlPortion(results[1:-1],thread_counts[1:-1,None]))
plt.scatter([4],amdahlPortion(results[-1:,0],thread_counts[-1]),color="blue")
plt.scatter([4],amdahlPortion(results[-1:,1],thread_counts[-1]), color="green")
plt.scatter([4],amdahlPortion(results[-1:,2],thread_counts[-1]), color="red")
plt.xticks(range(-1,6),["", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Ratio of parallelizable code to total code")
plt.legend(["10x10","100x100","1000x1000"],loc=10)

plt.tight_layout()
plt.savefig("feedforward2.pgf")
plt.show()



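As a worked check of amdahlPortion with round numbers read off the runs above: the single-thread 1000x1000 feed-forward pass takes about 88 s and the eight-thread pass about 17.5 s, a speedup of roughly 5x, which implies a parallel fraction of about 0.92.

speedup, threads = 88.0 / 17.5, 8   # ~5x, from the 1000x1000 feed-forward timings above
p = threads * (speedup - 1) / ((threads - 1) * speedup)
print(p)  # ~0.92, in line with the per-thread values computed below
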
In [91]:
amdahlPortion(results[1:-2,1:],thread_counts[1:-2,None])


Out[91]:
array([[ 0.94244669,  1.00064302],
       [ 0.84315011,  0.92640745],
       [ 0.77287544,  0.91731113]])

In [92]:
amdahlPortion(results[1:-2,1],thread_counts[1:-2])


Out[92]:
array([ 0.94244669,  0.84315011,  0.77287544])

In [93]:
amdahlPortion(results[1:-2,2],thread_counts[1:-2])


Out[93]:
array([ 1.00064302,  0.92640745,  0.91731113])

In [94]:
amdahlPortion(results[-1,1:],thread_counts[-1])


Out[94]:
array([ 0.94731871,  0.99711002])

In [95]:
np.average(amdahlPortion(results[1:-2,1:],thread_counts[1:-2,None]))


Out[95]:
0.9004723065515009

In [96]:
np.average(amdahlPortion(results[1:-2,1],thread_counts[1:-2]))


Out[96]:
0.85282408014801414

In [97]:
np.average(amdahlPortion(results[1:-2,2],thread_counts[1:-2]))


Out[97]:
0.94812053295498755

In [98]:
np.average(amdahlPortion(results[-1,1:],thread_counts[-1]))


Out[98]:
0.97221436787683058

In [109]:
thread_counts = np.array([1,2,4,8,16,32*256])  # last entry stands in for the GPU, treated as 32*256 "threads"
timings = np.load("backproptimings.npy")
results = timings[0,None]/timings
plt.figure()
plt.suptitle("Backpropagation", fontsize=24, y=1.05)
plt.subplot(2,2,1)
plt.title("All Speed Ups")
plt.plot(results[:-1])
plt.scatter([5],results[-1:,0],color="blue")
plt.scatter([5],results[-1:,1], color="green")
plt.scatter([5],results[-1:,2], color="red")
plt.xticks(range(-1,7),["","1", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Speed up ratio on one calculation (log scale)")
plt.yscale("log")
plt.legend(["10x10","100x100","1000x1000"],loc=2)

plt.subplot(2,2,2)
plt.title("CPU Only Speed Ups")
plt.plot(results[:-1]-1)
plt.xticks(range(-1,6),["","1", "2", "4", "8", "16", ""])
plt.xlabel("Number of threads")
plt.ylabel("Relative speed difference on one calculation")
plt.legend(["10x10","100x100","1000x1000"],loc=5)

plt.subplot(2,2,3)
plt.title("Speed Up Per Thread")
plt.plot((results[1:-1]-1)/thread_counts[1:-1,None])
plt.scatter([4],(results[-1:,0]-1)/thread_counts[-1],color="blue")
plt.scatter([4],(results[-1:,1]-1)/thread_counts[-1], color="green")
plt.scatter([4],(results[-1:,2]-1)/thread_counts[-1], color="red")
plt.xticks(range(-1,6),["", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Relative speed difference on one calculation")
plt.legend(["10x10","100x100","1000x1000"],loc=1)

plt.subplot(2,2,4)
def amdahlPortion(speedup, threads):
    # Amdahl's law S = 1 / ((1-p) + p/N), solved for the parallel fraction p = N(S-1) / ((N-1)S)
    return threads*(speedup-1)/((threads-1)*speedup)
plt.title("Amdahl's Law Calculated Parallelizable Portion")
plt.plot(amdahlPortion(results[1:-1],thread_counts[1:-1,None]))
plt.scatter([4],amdahlPortion(results[-1:,0],thread_counts[-1]),color="blue")
plt.scatter([4],amdahlPortion(results[-1:,1],thread_counts[-1]), color="green")
plt.scatter([4],amdahlPortion(results[-1:,2],thread_counts[-1]), color="red")
plt.xticks(range(-1,6),["", "2", "4", "8", "16", "GPU", ""])
plt.xlabel("Number of threads (or GPU)")
plt.ylabel("Ratio of parallelizable code to total code")
plt.legend(["10x10","100x100","1000x1000"],loc=2)

plt.tight_layout()
plt.savefig("backprop2.pgf")
plt.show()



In [100]:
amdahlPortion(results[1:-2,1:],thread_counts[1:-2,None])


Out[100]:
array([[ 0.44117566,  0.42318599],
       [ 0.41340681,  0.39162752],
       [ 0.36135818,  0.39241959]])

In [101]:
amdahlPortion(results[1:-2,1],thread_counts[1:-2])


Out[101]:
array([ 0.44117566,  0.41340681,  0.36135818])

In [102]:
amdahlPortion(results[1:-2,2],thread_counts[1:-2])


Out[102]:
array([ 0.42318599,  0.39162752,  0.39241959])

In [103]:
amdahlPortion(results[-1,1:],thread_counts[-1])


Out[103]:
array([ 0.96523919,  0.99790718])

In [104]:
np.average(amdahlPortion(results[1:-2,1:],thread_counts[1:-2,None]))


Out[104]:
0.40386228929710644

In [105]:
np.average(amdahlPortion(results[1:-2,1],thread_counts[1:-2]))


Out[105]:
0.40531354866514063

In [106]:
np.average(amdahlPortion(results[1:-2,2],thread_counts[1:-2]))


Out[106]:
0.40241102992907224

In [107]:
np.average(amdahlPortion(results[-1,1:],thread_counts[-1]))


Out[107]:
0.98157318393468651

In [ ]: