In [ ]:
import numpy as np
In [1]:
import ipyparallel as ipp
In [ ]:
# c = ipp.Client(profile='mpi',sshserver='marcin.kostur@155.158.128.149')
In [ ]:
# c = ipp.Client(profile='default',sshserver='marcin.kostur@155.158.128.148')
In [ ]:
#c = ipp.Client(profile='mpi',sshserver='marcin.kostur@155.158.128.98')
In [ ]:
c = ipp.Client(profile='mpi')
In [4]:
c = ipp.Client(profile='gpu4')
In [5]:
c.ids
Out[5]:
In [6]:
view = c[:]
In [7]:
view.activate()
In [8]:
%%px --block
import socket
print(socket.gethostbyname(socket.gethostname()))
In [9]:
%%px --block
import random
a=random.random()
In [10]:
view['a']
Out[10]:
In [11]:
view['a'] = 2.0
In [ ]:
%%px --block
import sys
sys.version
In [16]:
%%px --block
import sys
print(sys.version)
In [ ]:
import socket
socket.gethostbyname(socket.gethostname())
In [12]:
%%px --block
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
mod = SourceModule("""
__global__ void kernel(float *a)
{
    int idx = threadIdx.x + blockDim.x*blockIdx.x;
    if(threadIdx.x>2)
        a[idx] = 444.0f;
}
""")
a = np.zeros(10).astype(np.float32)
func = mod.get_function("kernel")
print (np.linspace(0,9,10))
print ("----------------")
print (a)
func(cuda.InOut(a),block=(5,1,1),grid=(2,1,1))
print (a)
In [13]:
view['a'][0]
Out[13]:
In [ ]:
%%px --block
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
from pycuda.curandom import rand as curand
cuda.init()
device = cuda.Device(0)
ctx = device.make_context()
Nx = 1000
Na = 1000
a = np.linspace(3.255,4,Na).astype(np.float32)
a = np.repeat(a,Nx)
a_gpu = gpuarray.to_gpu(a)
x_gpu = curand((Na*Nx,))
from pycuda.elementwise import ElementwiseKernel
iterate = ElementwiseKernel(
    "float *a, float *x",
    "x[i] = a[i]*x[i]*(1.0f-x[i])",
    "iterate")
Niter = 10000
for i in range(Niter):
    iterate(a_gpu, x_gpu)
ctx.synchronize()
print (Niter*Na*Nx/1e9)
a,x = a_gpu.get(),x_gpu.get()
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
a=view['a'][0]
x=view['x'][0]
In [ ]:
plt.figure(num=1, figsize=(30, 20))
every = 1
plt.plot(a[::every],x[::every],'ob',markersize=1,alpha=0.1)
plt.plot([3.83,3.83],[0,1])
In [ ]:
view.block = True
In [ ]:
%autopx
In [ ]:
import socket
print(socket.gethostbyname(socket.gethostname()))
import sys
sys.version
In [ ]:
import pycuda.gpuarray as gpuarray
from pycuda.curandom import rand as curand
from pycuda.compiler import SourceModule
import pycuda.driver as cuda
try:
    ctx.pop()
    ctx.detach()
except:
    print("No CTX!")
cuda.init()
device = cuda.Device(0)
ctx = device.make_context()
print(device.name(), device.compute_capability(), device.total_memory()/1024.**3, "GB")
print("and by the way, we have", cuda.Device.count(), "CUDA device(s) here")
In [ ]:
%px from ipyparallel import bind_kernel; bind_kernel()
In [ ]:
import sys
sys.version
In [ ]:
%matplotlib inline
In [ ]:
%%px --block
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
plt.plot(np.random.randn(1000),np.random.randn(1000),'yo')
In [ ]:
%%px --block
%%sh
pwd
In [ ]:
%%sh
pwd
In [ ]:
c = ipp.Client(profile='default')
In [ ]:
c.ids
In [ ]:
dview = c[:]
activate
- make the %px magics use the current view
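A minimal sketch of that step, assuming the dview created above: calling activate() registers this view as the one used by the %px, %%px, %autopx and %pxresult magics.
In [ ]:
# register dview as the target of the parallel magics
dview.activate()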
In [ ]:
%%px
%matplotlib inline
from pylab import *
plot([1,2,9],[2,9,3],'o-')
In [ ]:
%pxresult
In [ ]:
@dview.remote(block=True)
def getpid():
    import os
    return os.getpid()
In [ ]:
getpid()
In [ ]:
@dview.remote(block=True)
def iterate_logistic(a,N,Niter):
    import numpy as np
    x = np.random.random(N)
    for i in range(Niter):
        x = a*x*(1.0-x)
    return x
In [ ]:
iterate_logistic(4.0,1,10000)
In [ ]:
import numpy as np
x = np.random.random((2**12))
print(x.nbytes/1024**2,x.size)
a = np.ones_like(x)*4.0
In [ ]:
@dview.parallel(block=True)
def iterate_logistic(x,a):
    import numpy as np
    x = np.copy(x)
    for i in range(100000):
        x = a*x*(1.0-x)
    return x
In [ ]:
%%time
for i in range(100000):
    x = a*x*(1.0-x)
In [ ]:
%time x = iterate_logistic(x,a)
In [ ]:
dview?
In [ ]:
import ipyparallel.client.remotefunction
ipyparallel.client.remotefunction.ParallelFunction?
Configure the MPI profile first:
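One possible way to do that from the notebook (a sketch following the standard ipyparallel recipe; the commented config line is an assumption and the exact option name may differ between versions, so check the docs):
In [ ]:
# create a parallel profile called "mpi" (run once)
!ipython profile create --parallel --profile=mpi
# then, in ~/.ipython/profile_mpi/ipcluster_config.py, select the MPI engine launcher, e.g.
#   c.IPClusterEngines.engine_launcher_class = 'MPI'
# and start the cluster before connecting the Client below:
# !ipcluster start -n 4 --profile=mpi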
In [ ]:
c = ipp.Client(profile='mpi')
In [ ]:
c.ids
In [ ]:
c[:].apply_sync(lambda : "Hello, World")
It can also be interesting to connect to a remote controller over SSH:
c = Client('/path/to/my/ipcontroller-client.json', sshserver='me@myhub.example.com')
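The same thing as a (commented-out) cell in the style of the connection attempts at the top of this notebook; the connection-file path and SSH host are kept as placeholders:
In [ ]:
# c = ipp.Client('/path/to/my/ipcontroller-client.json', sshserver='me@myhub.example.com')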
In [ ]:
import mpi4py
In [ ]:
%%writefile psum.py
from mpi4py import MPI
import numpy as np
def psum(a):
    locsum = np.sum(a)
    rcvBuf = np.array(0.0,'d')
    MPI.COMM_WORLD.Allreduce([locsum, MPI.DOUBLE],
                             [rcvBuf, MPI.DOUBLE],
                             op=MPI.SUM)
    return rcvBuf
In [ ]:
view = c[:]
In [ ]:
view.activate()
In [ ]:
view.run('psum.py')
In [ ]:
view.scatter('a',np.arange(16,dtype='float'))
In [ ]:
view['a']
In [ ]:
%px totalsum = psum(a)
In [ ]:
view['totalsum']
In [ ]:
%%writefile psum.py
from mpi4py import MPI
import numpy as np
import time
def psum(a):
    locsum = np.sum(a)
    rcvBuf = np.array(0.0,'d')
    MPI.COMM_WORLD.Allreduce([locsum, MPI.DOUBLE],
                             [rcvBuf, MPI.DOUBLE],
                             op=MPI.SUM)
    return rcvBuf

rank = MPI.COMM_WORLD.Get_rank()
size = MPI.COMM_WORLD.Get_size()
if rank==0:
    a = np.arange(16,dtype='float')
else:
    a = None
# integer division so the local buffer size is an int
a_local = np.empty(16//size,dtype='float')
MPI.COMM_WORLD.Scatter( a, a_local )
time.sleep(rank*0.1)
print(rank,":::",a_local)
totalsum = psum(a_local)
print(totalsum)
In [ ]:
!mpirun -n 4 /opt/conda/envs/py27/bin/python psum.py