Workflow

  1. ipyparallel - launch a single-GPU engine
  2. perform the computations on the engine with the %%px magic
  3. pull every variable you need back with a = view['a']
  4. analyse the data on the frontend machine (see the sketch right below)
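
A minimal end-to-end sketch of these four steps, assuming a one-engine profile such as the gpu4 profile used below; the result variable is only illustrative:

In [ ]:
import numpy as np
import ipyparallel as ipp

c = ipp.Client(profile='gpu4')    # connect to the already running controller/engine
view = c[:]                       # DirectView over all engines (here: one GPU engine)
view.activate()                   # let the %px / %%px magics target this view

In [ ]:
%%px --block
# executed on the engine, i.e. next to the GPU
import numpy as np
result = np.arange(10)**2

In [ ]:
result = view['result'][0]        # pull the variable back to the frontend
result.sum()                      # analyse it locally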

In [ ]:
import numpy as np

In [1]:
import ipyparallel as ipp

In [ ]:
# c = ipp.Client(profile='mpi',sshserver='marcin.kostur@155.158.128.149')

In [ ]:
# c = ipp.Client(profile='default',sshserver='marcin.kostur@155.158.128.148')

In [ ]:
#c = ipp.Client(profile='mpi',sshserver='marcin.kostur@155.158.128.98')

In [ ]:
c = ipp.Client(profile='mpi')

In [4]:
c = ipp.Client(profile='gpu4')

In [5]:
c.ids


Out[5]:
[0]

In [6]:
view = c[:]

In [7]:
view.activate()

In [8]:
%%px --block
import socket
print(socket.gethostbyname(socket.gethostname()))


[stdout:0] 155.158.128.98

In [9]:
%%px --block
import random
a=random.random()

In [10]:
view['a']


Out[10]:
[0.16928542287577342]

In [11]:
view['a'] = 2.0

In [ ]:
%%px --block
import sys
sys.version

In [16]:
%%px --block

import sys
print(sys.version)


[stdout:0] 
3.5.2 | packaged by conda-forge | (default, Jul 26 2016, 01:32:08) 
[GCC 4.8.2 20140120 (Red Hat 4.8.2-15)]

In [ ]:
import socket
socket.gethostbyname(socket.gethostname())

In [12]:
%%px --block

import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit                       # creates a CUDA context on the engine
from pycuda.compiler import SourceModule

# every thread with threadIdx.x > 2 overwrites its array element with 444
mod = SourceModule("""
    __global__ void kernel(float *a)
    {
      int idx = threadIdx.x + blockDim.x*blockIdx.x;

      if(threadIdx.x>2)
          a[idx] = 444.0f;
    }
    """)

a = np.zeros(10).astype(np.float32)
func = mod.get_function("kernel")
print(np.linspace(0, 9, 10))                 # index reference line
print("----------------")
print(a)                                     # before the kernel: all zeros
func(cuda.InOut(a), block=(5, 1, 1), grid=(2, 1, 1))
print(a)                                     # after: 444 where threadIdx.x > 2


[stdout:0] 
[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9.]
----------------
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[   0.    0.    0.  444.  444.    0.    0.    0.  444.  444.]

In [13]:
view['a'][0]


Out[13]:
array([   0.,    0.,    0.,  444.,  444.,    0.,    0.,    0.,  444.,  444.], dtype=float32)

In [ ]:
%%px --block
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit                       # autoinit already provides the CUDA context
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
from pycuda.curandom import rand as curand
from pycuda.elementwise import ElementwiseKernel

Nx = 1000                                    # samples per parameter value
Na = 1000                                    # number of parameter values

# parameter a in [3.255, 4], each value repeated Nx times
a = np.linspace(3.255, 4, Na).astype(np.float32)
a = np.repeat(a, Nx)

a_gpu = gpuarray.to_gpu(a)
x_gpu = curand((Na*Nx,))                     # uniform random initial conditions on the GPU

# one logistic-map step x -> a*x*(1-x), applied element-wise on the GPU
iterate = ElementwiseKernel(
        "float *a, float *x",
        "x[i] = a[i]*x[i]*(1.0f-x[i])",
        "iterate")

Niter = 10000
for i in range(Niter):
    iterate(a_gpu, x_gpu)
cuda.Context.synchronize()
print(Niter*Na*Nx/1e9)                       # total map updates, in billions
a, x = a_gpu.get(), x_gpu.get()

In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
a = view['a'][0]     # pull the computed arrays back from engine 0
x = view['x'][0]

In [ ]:
plt.figure(num=1, figsize=(30, 20))
every = 1
plt.plot(a[::every], x[::every], 'ob', markersize=1, alpha=0.1)
plt.plot([3.83, 3.83], [0, 1])   # vertical line near a = 3.83, close to the period-3 window

In [ ]:
view.block = True

In [ ]:
%autopx

In [ ]:
import socket
print(socket.gethostbyname(socket.gethostname()))
import sys
sys.version

In [ ]:
import pycuda.gpuarray as gpuarray

from pycuda.curandom import rand as curand
from pycuda.compiler import SourceModule
import pycuda.driver as cuda

# drop a context left over from a previous run, if there is one
try:
    ctx.pop()
    ctx.detach()
except Exception:
    print("No CTX!")

cuda.init()
device = cuda.Device(0)
ctx = device.make_context()
print(device.name(), device.compute_capability(), device.total_memory()/1024.**3, "GB")
print("and by the way, we have", cuda.Device.count(), "devices here")

In [ ]:
%px from ipyparallel import bind_kernel; bind_kernel()

In [ ]:
import sys
sys.version

In [ ]:
%matplotlib inline

In [ ]:
%%px --block
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
plt.plot(np.random.randn(1000),np.random.randn(1000),'yo')

In [ ]:
%%px --block
%%sh
pwd

In [ ]:
%%sh
pwd

In [ ]:
c = ipp.Client(profile='default')

In [ ]:
c.ids

In [ ]:
dview = c[:]

activate() - make the %px magics use the current view
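
A short sketch of what activate() enables, using the dview defined above (the pid variable is only illustrative):

In [ ]:
dview.activate()        # register %px, %%px, %pxresult, %autopx for this view

In [ ]:
%px import os; pid = os.getpid()

In [ ]:
dview['pid']            # one PID per engine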


In [ ]:
%%px
%matplotlib inline
from pylab import *
plot([1,2,9],[2,9,3],'o-')

In [ ]:
%pxresult

@dview.remote - turn a function into one that executes on every engine in the view


In [ ]:
@dview.remote(block=True)
def getpid():
    import os
    return os.getpid()

In [ ]:
getpid()

In [ ]:
@dview.remote(block=True)
def iterate_logistic(a,N,Niter):
    import numpy as np 
    x = np.random.random(N)
    for i in range(Niter):
        x = a*x*(1.0-x)
    return x

In [ ]:
iterate_logistic(4.0,1,10000)

@dview.parallel - scatter the arguments across the engines, apply the function in parallel, and gather the results


In [ ]:
import numpy as np
x = np.random.random((2**12))
print(x.nbytes/1024**2,x.size)
a = np.ones_like(x)*4.0

In [ ]:
@dview.parallel(block=True)
def iterate_logistic(x,a):
    import numpy as np 
    x = np.copy(x)
    for i in range(100000):
        x = a*x*(1.0-x)
    return x

In [ ]:
%%time
for i in range(100000):
    x = a*x*(1.0-x)

In [ ]:
%time x = iterate_logistic(x,a)

In [ ]:
dview?

In [ ]:
import ipyparallel.client.remotefunction
ipyparallel.client.remotefunction.ParallelFunction?

In [ ]:
c = ipp.Client(profile='mpi')

In [ ]:
c.ids

In [ ]:
c[:].apply_sync(lambda : "Hello, World")

It may also be interesting to connect to a controller on a remote machine through an SSH tunnel:

c = Client('/path/to/my/ipcontroller-client.json', sshserver='me@myhub.example.com')
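
A sketch of how such a connection would be used; both the JSON path and the SSH server above are placeholders, not real values:

In [ ]:
# placeholder path and host, copied from the line above
rc = ipp.Client('/path/to/my/ipcontroller-client.json',
                sshserver='me@myhub.example.com')
rc.ids          # engines reachable through the SSH tunnel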

In [ ]:
import mpi4py

In [ ]:
%%writefile psum.py
from mpi4py import MPI
import numpy as np

def psum(a):
    """Parallel sum: Allreduce the per-rank partial sums into one total on every rank."""
    locsum = np.sum(a)
    rcvBuf = np.array(0.0, 'd')
    MPI.COMM_WORLD.Allreduce([locsum, MPI.DOUBLE],
        [rcvBuf, MPI.DOUBLE],
        op=MPI.SUM)
    return rcvBuf

In [ ]:
view = c[:]

In [ ]:
view.activate()

In [ ]:
view.run('psum.py')

In [ ]:
view.scatter('a',np.arange(16,dtype='float'))

In [ ]:
view['a']

In [ ]:
%px totalsum = psum(a)

In [ ]:
view['totalsum']

In [ ]:
%%writefile psum.py
from __future__ import print_function   # the script is run with the py27 interpreter below
from mpi4py import MPI
import numpy as np
import time

def psum(a):
    """Parallel sum: Allreduce the per-rank partial sums into one total on every rank."""
    locsum = np.sum(a)
    rcvBuf = np.array(0.0, 'd')
    MPI.COMM_WORLD.Allreduce([locsum, MPI.DOUBLE],
        [rcvBuf, MPI.DOUBLE],
        op=MPI.SUM)
    return rcvBuf

rank = MPI.COMM_WORLD.Get_rank()
size = MPI.COMM_WORLD.Get_size()
if rank == 0:
    a = np.arange(16, dtype='float')
else:
    a = None
a_local = np.empty(16//size, dtype='float')   # integer division: works on Python 2 and 3
MPI.COMM_WORLD.Scatter(a, a_local)            # root 0 distributes equal chunks
time.sleep(rank*0.1)                          # stagger the prints by rank
print(rank, ":::", a_local)

totalsum = psum(a_local)
print(totalsum)

In [ ]:
!mpirun -n 4 /opt/conda/envs/py27/bin/python psum.py
