In [6]:
import os
import sys
#os.environ["CUDA_VISIBLE_DEVICES"]="1"
import tensorflow as tf
import time

n = 8192
dtype = tf.float32
with tf.device("/gpu:0"):
    matrix1 = tf.Variable(tf.ones((n, n), dtype=dtype))
    matrix2 = tf.Variable(tf.ones((n, n), dtype=dtype))
    product = tf.matmul(matrix1, matrix2)


# disable graph optimizations so the repeated matmul isn't folded away
config = tf.ConfigProto(
    graph_options=tf.GraphOptions(
        optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)))
sess = tf.Session(config=config)

sess.run(tf.global_variables_initializer())
iters = 10

# pre-warm: the first run pays one-time costs (memory allocation, autotuning)
sess.run(product.op)

start = time.time()
for i in range(iters):
    sess.run(product.op)
end = time.time()
ops = n**3 + (n-1)*n**2  # n^3 multiplications + n^2*(n-1) additions
elapsed = (end - start)
rate = iters*ops/elapsed/10**9
print('\n %d x %d matmul took: %.2f sec, %.2f G ops/sec'
      % (n, n, elapsed/iters, rate))


 8192 x 8192 matmul took: 0.27 sec, 4139.07 G ops/sec
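
For comparison, the same measurement can be written against the TF 2.x API. A minimal sketch (assuming a tf.function-traced step, with .numpy() used to force device sync; not run in this notebook):

In [ ]:
import time
import tensorflow as tf

n = 8192
with tf.device("/GPU:0"):
    a = tf.Variable(tf.ones((n, n), dtype=tf.float32))
    b = tf.Variable(tf.ones((n, n), dtype=tf.float32))

@tf.function
def matmul_step():
    return tf.matmul(a, b)

# pre-warm: triggers tracing, allocation, and kernel autotuning
_ = matmul_step().numpy()

iters = 10
start = time.time()
for _ in range(iters):
    _ = matmul_step().numpy()  # .numpy() blocks until the GPU finishes
elapsed = time.time() - start

ops = n**3 + (n - 1) * n**2
print('%d x %d matmul took: %.2f sec, %.2f G ops/sec'
      % (n, n, elapsed / iters, iters * ops / elapsed / 1e9))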

In [3]:
#https://stackoverflow.com/questions/41804380/testing-gpu-with-tensorflow-matrix-multiplication

In [4]:
#https://github.com/yaroslavvb/stuff/blob/master/gpu-memory-transfer.ipynb
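
The linked notebook looks at GPU memory transfer. As a rough self-contained sketch in the same TF 1.x style (not the linked code), one can time host-to-device copies by feeding a host array into a GPU-resident variable; the feed_dict path includes Python-side overhead, so this only gives a lower bound on bandwidth:

In [ ]:
import time
import numpy as np
import tensorflow as tf

n = 8192
host = np.ones((n, n), dtype=np.float32)

ph = tf.placeholder(tf.float32, shape=(n, n))
with tf.device("/gpu:0"):
    dev = tf.Variable(tf.zeros((n, n), dtype=tf.float32))
    copy = dev.assign(ph).op  # forces a host->device transfer of the fed array

sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(copy, feed_dict={ph: host})  # pre-warm

iters = 10
start = time.time()
for _ in range(iters):
    sess.run(copy, feed_dict={ph: host})
elapsed = time.time() - start
print('host->device: %.2f GB/sec' % (host.nbytes * iters / elapsed / 1e9))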

In [5]:
#https://github.com/yaroslavvb/stuff/blob/master/gpu_svd_bench.py
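
The linked script benchmarks SVD. A minimal sketch following the same warm-up-then-time pattern as the matmul cell above (not the linked code; the matrix size is made up, and whether the decomposition actually runs on GPU depends on the TF build):

In [ ]:
import time
import tensorflow as tf

n = 1024  # illustrative size, not from the linked script
with tf.device("/gpu:0"):
    mat = tf.Variable(tf.random_normal((n, n), dtype=tf.float32))
    s = tf.svd(mat, compute_uv=False)  # singular values only

# soft placement: fall back to CPU if this build lacks a GPU SVD kernel
sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
sess.run(tf.global_variables_initializer())
sess.run(s.op)  # pre-warm

iters = 10
start = time.time()
for _ in range(iters):
    sess.run(s.op)
print('%d x %d svd took: %.3f sec/iter' % (n, n, (time.time() - start) / iters))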

In [10]:
import numpy as np

# CPU baseline: same 8192 x 8192 float32 matmul via NumPy's BLAS
a = np.ones((8192, 8192), dtype=np.float32)
b = a.copy()

In [11]:
%timeit a @ b


5.75 s ± 99.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
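
For comparison with the GPU number above: the same op count (n**3 + (n-1)*n**2, about 1.1e12) over the 5.75 s CPU timing works out to roughly 190 G ops/sec, about 20x slower than the 4139 G ops/sec measured on the GPU. A quick sanity-check cell:

In [ ]:
n = 8192
ops = n**3 + (n - 1) * n**2  # same op count as the GPU benchmark
print('%.2f G ops/sec' % (ops / 5.75 / 1e9))  # 5.75 s from %timeit above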
