In [3]:
%matplotlib inline 
import matplotlib.pyplot as plt
import numpy as np

In [4]:
import tensorflow as tf
tf.__version__


Out[4]:
'1.0.0'

Performance (Giter/s = 10^9 logistic-map updates per second):

  • ultra-optimized logistic kernel: ~200 Giter/s
  • PyCUDA ElementwiseKernel: 10+ Giter/s (sketch below)
  • CPU, single core: 0.3 Giter/s
  • TensorFlow on GPU: 5 Giter/s
  • TensorFlow on CPU (20 cores): 0.5 Giter/s
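
The ElementwiseKernel entry above is not benchmarked in this notebook; a minimal sketch of what it could look like (assuming the a_gpu/x_gpu arrays created below) is:

from pycuda.elementwise import ElementwiseKernel

# one fused logistic-map update per element, compiled once and reusable
logistic_step = ElementwiseKernel(
    "float *a, float *x",
    "x[i] = a[i]*x[i]*(1.0f - x[i])",
    "logistic_step")

# usage: call logistic_step(a_gpu, x_gpu) repeatedly from a Python loop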

In [7]:
import pycuda.gpuarray as gpuarray

from pycuda.curandom import rand as curand
from pycuda.compiler import SourceModule
import pycuda.driver as cuda

try:
    # pop and release a context left over from a previous run of this cell
    ctx.pop()
    ctx.detach()
except NameError:
    print("No CTX!")

cuda.init()
device = cuda.Device(1)   # use the second GPU; adjust the index for your machine
ctx = device.make_context()


mod = SourceModule("""
    __global__ void logistic_iterations(float *a, float *x, int Niter)
    {
      // one thread per (a, x) pair; keep the state in registers for Niter steps
      int idx = threadIdx.x + blockDim.x*blockIdx.x;
      float a_ = a[idx];
      float x_ = x[idx];
      int i;
      for (i = 0; i < Niter; i++){
          x_ = a_*x_*(1 - x_);
      }
      x[idx] = x_;
    }
    """)
logistic_iterations = mod.get_function("logistic_iterations")


No CTX!

In [25]:
block_size = 128
Nx = 1024                  # initial conditions per parameter value
Na = 1024                  # number of parameter values
blocks = Nx*Na//block_size

a = np.linspace(3.255, 4, Na).astype(np.float32)   # logistic parameter a
a = np.repeat(a, Nx)                               # one copy of a per trajectory
x = np.random.rand(Na*Nx).astype(np.float32)       # random initial conditions in (0, 1)

In [26]:
a_gpu = gpuarray.to_gpu(a)
#%time x_gpu = curand((Na*Nx,))
%time x_gpu = gpuarray.to_gpu(x)


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.49 ms

In [10]:
%%time
logistic_iterations(a_gpu,x_gpu, np.int32(1000),block=(block_size,1,1), grid=(blocks,1,1))
ctx.synchronize()


CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 6.12 ms
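
A quick sanity check of the throughput implied by this timing, using the sizes defined above:

iters = Nx*Na*1000          # 1024*1024 trajectories, 1000 map steps each
print(iters/6.12e-3/1e9)    # ≈ 171 Giter/s for the 6.12 ms wall time above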

In [27]:
%%time
for i in range(1000):
    logistic_iterations(a_gpu,x_gpu, np.int32(1),block=(block_size,1,1), grid=(blocks,1,1))
ctx.synchronize()


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 560 µs
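
At this scale the host-side %%time is dominated by kernel-launch overhead rather than device work. A sketch using CUDA events (part of the standard pycuda.driver API) times a single launch on the device itself:

start, end = cuda.Event(), cuda.Event()
start.record()
logistic_iterations(a_gpu, x_gpu, np.int32(1),
                    block=(block_size,1,1), grid=(blocks,1,1))
end.record()
end.synchronize()
print(start.time_till(end), "ms")   # elapsed device time in milliseconds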

In [12]:
%%time 
for i in range(1000):
    x_gpu = a_gpu*x_gpu*(1-x_gpu)


CPU times: user 516 ms, sys: 732 ms, total: 1.25 s
Wall time: 1.53 s

In [13]:
import numexpr as ne

In [14]:
%%time 
for i in range(1000):
    x = ne.evaluate("a*x*(1-x)")


CPU times: user 5.65 s, sys: 124 ms, total: 5.78 s
Wall time: 968 ms

In [15]:
%%time 
for i in range(1000):
    x = a*x*(1-x)


CPU times: user 1.88 s, sys: 8 ms, total: 1.88 s
Wall time: 1.88 s

In [16]:
%%time 
for i in range(1000):
    x *= (1.0-x)
    x *= a


CPU times: user 1.48 s, sys: 0 ns, total: 1.48 s
Wall time: 1.47 s

In [28]:
a,x = a_gpu.get(),x_gpu.get()

In [29]:
H, xedges, yedges = np.histogram2d(a, x, bins=(Na,500))
plt.figure(num=1, figsize=(10,5))
plt.imshow(1-np.log(H.T+5e-1), origin='lower', cmap='gray')   # bifurcation diagram: a horizontal, x vertical


Out[29]:
<matplotlib.image.AxesImage at 0x7f54503b0160>

Logistic with tf


In [9]:
%matplotlib inline 
import matplotlib.pyplot as plt
import numpy as np 

import tensorflow as tf

config = tf.ConfigProto()
config.gpu_options.allow_growth = True   # allocate GPU memory on demand instead of all at once

Nx = 10240
Na = 1024

a = np.linspace(3.255,4,Na).astype(np.float32)
a = np.repeat(a,Nx)
x = np.random.rand(Na*Nx).astype(np.float32)

Outer loop


In [10]:
with tf.device('/gpu:0'):
    X = tf.Variable(x)
    A = tf.constant(a)

    X_ = A*X*(1.0-X)
    step = tf.group(X.assign(X_))   # does the assign op need to live inside the device scope?

In [11]:
sess = tf.InteractiveSession(config=config)
init_op = tf.global_variables_initializer()
sess.run(init_op)

In [12]:
step = tf.group( X.assign(X_) )

In [13]:
%%time 
for i in range(1000):
    step.run()


CPU times: user 3.25 s, sys: 120 ms, total: 3.37 s
Wall time: 2.76 s

tf.while_loop


In [14]:
with tf.device('/gpu:0'):
    X = tf.Variable(x)
    A = tf.constant(a)

    def c(i, x):                 # loop condition: stop after 1000 steps
        return tf.less(i, 1000)

    def b(i, x):                 # loop body: one logistic-map update
        x = A*x*(1-x)
        return tf.add(i, 1), x

    r = tf.while_loop(c, b, (tf.constant(0), X))

In [15]:
init_op = tf.global_variables_initializer()
with tf.Session(config=config) as sess:
    sess.run(init_op)
    %time iterations,x = sess.run(r)


CPU times: user 1.8 s, sys: 364 ms, total: 2.17 s
Wall time: 1.94 s
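
A back-of-the-envelope check against the performance list at the top (Nx*Na = 10240*1024 points, 1000 steps each):

updates = Nx*Na*1000
print(updates/2.76/1e9)   # outer Python loop: ≈ 3.8 Giter/s
print(updates/1.94/1e9)   # tf.while_loop:    ≈ 5.4 Giter/s, i.e. the ~5 Giter/s quoted above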

WRONG!

The pattern below creates a huge graph: every call to X.assign(X_) adds a new assign op to the default graph, one per loop iteration.

for i in range(100):
    sess.run(X.assign(X_))
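
The fix is to build the assign op once and then run that same op repeatedly, as done in the cells above:

step = X.assign(X_)      # graph node created once, outside the loop
for i in range(100):
    sess.run(step)       # re-runs the same node; the graph does not grow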

Visualization


In [12]:
H, xedges, yedges = np.histogram2d(a,x,bins=(Na,500))

plt.figure(num=1, figsize=(10,5))
plt.imshow(1-np.log(H.T+5e-1),origin='lower',cmap='gray')
