(after the command: ipcluster start -n 4)



In [2]:

    
from IPython.parallel import  Client



In [2]:

    
rc = Client()



In [3]:

    
rc.ids









    Out[3]:





[0, 1, 2, 3]



In [4]:

    
px import os



In [5]:

    
px print os.getpid()









    



[stdout:0] 2633
[stdout:1] 2634
[stdout:2] 2635
[stdout:3] 2636



In [6]:

    
pxconfig --targets 1



In [7]:

    
px print os.getpid()



In [8]:

    
%%px --targets 2
print os.getpid()



In [9]:

    
pxconfig --targets all



In [10]:

    
px print os.getpid()









    



[stdout:0] 2633
[stdout:1] 2634
[stdout:2] 2635
[stdout:3] 2636



In [11]:

    
%%px --noblock
import time
time.sleep(1)
os.getpid()









    Out[11]:





<AsyncResult: execute>



In [12]:

    
%pxresult









    





Out[0:21]: 2633






    





Out[1:24]: 2634






    





Out[2:22]: 2635






    





Out[3:18]: 2636

Parallel map



In [13]:

    
v = rc[:] # get a view to the engines



In [14]:

    
with v.sync_imports():
    import time









    



importing time on engine(s)



In [15]:

    
def f(x):
    """Return ``x`` squared after pausing for one second.

    The one-second sleep makes every call take a noticeable amount of time,
    which is what the serial and parallel ``map`` timings in this notebook
    measure.
    """
    time.sleep(1)  # fixed delay per call
    squared = x ** 2
    return squared



In [16]:

    
v.map_sync(f, range(10))









    Out[16]:





[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]



In [17]:

    
timeit -n 1 -r 1 v.map_sync(f, range(10))









    



1 loops, best of 1: 3.01 s per loop



In [18]:

    
timeit -n 1 -r 1 map(f, range(10))









    



1 loops, best of 1: 10 s per loop

Asynchronous map:



In [19]:

    
r = v.map(f, range(10))



In [21]:

    
r.ready(), r.elapsed









    Out[21]:





(True, 3.484876)



In [59]:

    
r.get()









    Out[59]:





[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]



In [60]:

    
r.elapsed, r.serial_time









    Out[60]:





(4.636209, 10.011764)

Monte Carlo Simulation



In [11]:

    
def sample(n):
    """Monte Carlo estimate of pi from ``n`` random points.

    Draws two arrays of ``n`` values with ``rand`` (the x and y coordinates),
    counts the points with ``x**2 + y**2 <= 1`` and returns four times the
    fraction of such points.
    """
    xs = rand(n)
    ys = rand(n)
    inside = xs**2 + ys**2 <= 1  # boolean mask of points inside the circle
    return 4 * inside.sum() / float(n)



In [52]:

    
n = 1000000



In [53]:

    
timeit -r 10 -n 1 sample(n)









    



1 loops, best of 10: 31.7 ms per loop



In [14]:

    
from IPython.parallel import Client
rc = Client()
v = rc[:]



In [15]:

    
with v.sync_imports():
    from numpy.random import rand
    import numpy









    



importing rand from numpy.random on engine(s)
importing numpy on engine(s)



In [19]:

    
timeit -r 10 -n 1 sum(v.map_sync(sample, [n]*4)) / 4 # len(v) = 4









    



1 loops, best of 10: 459 ms per loop



In [165]:

    
(rand(5) <= 0.5)









    Out[165]:





array([ True, False,  True, False,  True], dtype=bool)



In [65]:

    
(rand(4)**2 + rand(4)**2 <= 1)









    Out[65]:





array([ True, False,  True,  True], dtype=bool)



In [128]:

    
%load_ext cythonmagic

Pure Python (to compare to Cython, and Cython+NumPy)



In [182]:

    
import random



In [157]:

    
def mcpy1(n):
    """Pure-Python Monte Carlo estimate of pi from ``n`` random points.

    For each of the ``n`` iterations two values are drawn with
    ``random.random()``; the point counts as a hit when the sum of their
    squares is at most 1.  Returns four times the hit fraction.
    """
    hits = 0
    for _ in xrange(n):
        x = random.random()
        y = random.random()
        if x**2 + y**2 <= 1:
            hits += 1
    return 4 * hits / float(n)



In [158]:

    
n = 1000000
# r1 = [random.random() for k in range(n)]



In [160]:

    
timeit -r 10 -n 10 mcpy1(n)









    



10 loops, best of 10: 380 ms per loop

Cython



In [1]:

    
%load_ext cythonmagic



In [29]:

    
%%cython
from libc.stdlib cimport rand, RAND_MAX
def mcpy2(int n):
    """Cython Monte Carlo estimate of pi from ``n`` random points.

    Both coordinates come from the C library ``rand()`` scaled by
    ``RAND_MAX``; a point is a hit when the sum of the squared coordinates
    is at most 1.  Returns four times the hit fraction.
    """
    cdef int hits = 0
    cdef int idx = 0
    cdef double x, y
    for idx in xrange(n):
        x = float(rand()) / RAND_MAX
        y = float(rand()) / RAND_MAX
        if x**2 + y**2 <= 1:
            hits += 1
    return 4 * hits / float(n)



In [30]:

    
n = 1000000



In [31]:

    
timeit -r 10 -n 10 mcpy2(n)









    



10 loops, best of 10: 13.3 ms per loop



In [47]:

    
mcpy2(n)









    Out[47]:





3.141796

Cython+NumPy (not much different from the pure NumPy Monte Carlo above, because the NumPy types aren't declared as C types. See the examples further below for how to do that.)



In [171]:

    
%%cython
import random
import numpy as np
def mcpy3(int n):
    """Monte Carlo estimate of pi from ``n`` points using NumPy inside Cython.

    Two arrays of ``n`` values are drawn with ``np.random.rand``; the number
    of points with ``x**2 + y**2 <= 1`` is counted with ``np.sum`` and four
    times the hit fraction is returned.
    """
    xs = np.random.rand(n)
    ys = np.random.rand(n)
    hits = np.sum(xs**2 + ys**2 <= 1)
    return 4 * hits / float(n)



In [172]:

    
n = 1000000



In [173]:

    
timeit -r 10 -n 1 mcpy3(n)









    



1 loops, best of 10: 31.6 ms per loop

Random numbers may be misleading, so try another algorithm: sum the squares of all integers below 1 000 000.

Pure Python:



In [101]:

    
n = 1000000



In [126]:

    
def sumProc01(n=1000000):
    """Return the sum of the squares of the integers ``0 .. n-1``.

    Pure-Python version for the timing comparison: the squares are produced
    with ``map`` and a ``lambda`` and added up with the built-in ``sum``.

    Parameters
    ----------
    n : int, optional
        Exclusive upper bound of the range that is squared and summed.
        Defaults to 1000000, so calling ``sumProc01()`` with no argument
        sums the squares of 0 .. 999999.

    Returns
    -------
    int
        ``0**2 + 1**2 + ... + (n-1)**2`` (0 when ``n <= 0``).
    """
    return sum(map(lambda x: x**2, xrange(n)))



In [127]:

    
timeit -r 10 -n 10 sumProc01()









    



10 loops, best of 10: 251 ms per loop



In [31]:

    
print sumProc01()

Cython:



In [199]:

    
n = 1000000



In [180]:

    
%%cython
def sumProc02():
    """Return the sum of the squares of the integers 0 .. 999999.

    Same ``map`` + ``lambda`` + ``sum`` formulation as the pure-Python
    version, compiled by Cython without any C type declarations.
    """
    squares = map(lambda value: value**2, xrange(1000000))
    return sum(squares)



In [181]:

    
timeit -r 10 -n 10 sumProc02()









    



10 loops, best of 10: 90.7 ms per loop



In [106]:

    
def bla():
    """Return the sum of the squares of the integers 0 .. 999999.

    The squares are first collected in a list with an explicit ``append``
    loop and then added up with the built-in ``sum``.
    """
    squares = []
    for value in xrange(1000000):
        squares.append(value**2)
    total = sum(squares)
    return total



In [113]:

    
timeit -r 10 -n 10 bla()









    



10 loops, best of 10: 227 ms per loop



In [193]:

    
%%cython
def bla2():
    """Return the sum of the squares of the integers 0 .. 999999.

    Cython version of the append-loop formulation: the squares are collected
    in a Python list and added up with the built-in ``sum``; only the loop
    counter is a C variable.
    """
    out = []
    # The counter is 64-bit on purpose: ``k**2`` is evaluated in the C type
    # of ``k`` and reaches 999999**2 (about 1e12), which does not fit in a
    # 32-bit ``int`` and would silently wrap around.
    cdef long long k = 0
    for k in xrange(1000000):
        out.append(k**2)
    return sum(out)



In [194]:

    
timeit -r 10 -n 10 bla2()









    



10 loops, best of 10: 26.8 ms per loop

Basic pure Python vs. NumPy vs. Cython comparison:



In [24]:

    
def bla3():
    """Return the sum of the squares of the integers 0 .. 999999.

    Plain Python accumulation loop without any intermediate list.
    """
    total = 0
    for value in xrange(1000000):
        total = total + value**2
    return total



In [25]:

    
timeit -r 10 -n 10 bla3()









    



10 loops, best of 10: 85.8 ms per loop



In [20]:

    
def bla5():
    """Return the sum of the squares of the integers 0 .. 999999 using NumPy.

    The range is created explicitly as 64-bit integers.  NumPy's default
    integer type depends on the platform and can be 32-bit, while the result
    (333332833333500000) needs more than 32 bits; pinning the dtype keeps
    the squaring and the sum from overflowing on such platforms.
    """
    return (arange(1000000, dtype='int64')**2).sum()



In [21]:

    
timeit -r 10 -n 10 bla5()









    



10 loops, best of 10: 4.34 ms per loop



In [66]:

    
%%cython
def bla4():
    """Return the sum of the squares of the integers 0 .. 999999.

    Fully C-typed accumulation loop.  Both the accumulator and the counter
    are 64-bit: the final sum is 333332833333500000 and the individual
    squares reach about 1e12, neither of which fits in a 32-bit ``int``
    (C integer overflow is silent and would give a wrong result).
    """
    cdef long long out = 0
    cdef long long k = 0
    for k in xrange(1000000):
        out += k**2
    return out



In [67]:

    
timeit -r 10 -n 10 bla4()









    



10 loops, best of 10: 607 us per loop

Does (x)range add any overhead? Compare with the above bla4():



In [50]:

    
%%cython
def blanext():
    """Return the sum of the squares of the integers 0 .. 999999.

    Same C-typed accumulation as ``bla4`` but driven by an explicit
    ``while`` loop instead of ``xrange``, to see whether ``xrange`` adds any
    overhead.  The accumulator and counter are 64-bit because the final sum
    (333332833333500000) and the individual squares (up to about 1e12) do
    not fit in a 32-bit ``int``.
    """
    cdef long long out = 0
    cdef long long k = 0
    while k < 1000000:
        out += k**2
        k += 1
    return out



In [54]:

    
timeit -r 10 -n 10 blanext()









    



10 loops, best of 10: 615 us per loop

Sieve



In [142]:

    
def primes1(n):
    """Return the list of all primes smaller than ``n`` (sieve of Eratosthenes).

    A boolean table with one entry per integer below ``n`` is kept; for every
    entry that is still marked prime, its multiples starting at its square
    are crossed out.  The indices that remain marked are returned in
    increasing order.
    """
    is_prime = [False, False] + [True] * (n - 2)
    candidate = 2
    while candidate < n:
        if is_prime[candidate]:
            # Smaller multiples were already crossed out by smaller primes.
            multiple = candidate * candidate
            while multiple < n:
                is_prime[multiple] = False
                multiple += candidate
        candidate += 1
    return [idx for idx in xrange(2, n) if is_prime[idx]]



In [144]:

    
primes1(20)









    Out[144]:





[2, 3, 5, 7, 11, 13, 17, 19]



In [145]:

    
m = 10000



In [148]:

    
timeit -n 100 -r 3 primes1(m)









    



100 loops, best of 3: 3.28 ms per loop

Naive:



In [149]:

    
%load_ext cythonmagic



In [150]:

    
%%cython
def primes2(n):
    """Return the list of all primes smaller than ``n`` (sieve of Eratosthenes).

    Untyped sieve compiled by Cython: every entry of the boolean table that
    is still marked prime has its multiples, from its square upwards,
    crossed out; the surviving indices are returned in increasing order.
    """
    sieve = [False, False] + [True] * (n - 2)
    current = 2
    while current < n:
        if sieve[current]:
            # Start at the square; lower multiples are already crossed out.
            composite = current * current
            while composite < n:
                sieve[composite] = False
                composite += current
        current += 1
    return [number for number in xrange(2, n) if sieve[number]]



In [151]:

    
timeit -n 100 -r 3 primes2(m)









    



100 loops, best of 3: 1.42 ms per loop

With C types:



In [152]:

    
%%cython
def primes3(int n):
    """Return the list of all primes smaller than ``n`` (sieve of Eratosthenes).

    Same algorithm as ``primes1``/``primes2`` with the loop variables
    declared as C integers.  The boolean table itself stays a Python list.
    """
    primes = [False, False] + [True] * (n-2)
    cdef int i = 2
    # ``k`` holds i*i, which exceeds the 32-bit ``int`` range as soon as
    # i >= 46341.  A wrapped (negative) ``k`` would pass the ``k < n`` test
    # and be used as a list index, so ``k`` is 64-bit and the product is
    # computed in 64-bit arithmetic.
    cdef long long k = 0
    while i < n:
        if not primes[i]:
            i += 1
            continue
        k = <long long>i * i
        while k < n:
            primes[k] = False
            k += i
        i += 1
    return [i for i in xrange(2,n) if primes[i]]



In [153]:

    
timeit -n 100 -r 3 primes3(m)









    



100 loops, best of 3: 264 us per loop

Using NumPy and Cython



In [1]:

    
%load_ext cythonmagic



In [7]:

    
def step1():
    """Return the sign of one ``rand`` sample shifted down by 0.5.

    The result is a one-element array, because ``rand(1)`` is used to draw
    the sample.
    """
    shifted = rand(1) - .5
    return sign(shifted)

def sim1(n):
    """Simulate a random walk with ``n`` points and return it as an array.

    The walk starts at 0; every following point equals the previous point
    plus ``1/n`` times the value returned by ``step1()``.
    """
    path = zeros(n)
    increment = 1. / n
    for idx in xrange(1, n):
        path[idx] = path[idx - 1] + increment * step1()
    return path



In [9]:

    
plot(sim1(10000))









    Out[9]:





[<matplotlib.lines.Line2D at 0x10830bf90>]



In [27]:

    
m = 10000



In [10]:

    
timeit sim1(m)









    



10 loops, best of 3: 112 ms per loop

Naive Cython + NumPy:



In [16]:

    
%%cython
import numpy as np
cdef int step2():
    # Sign of one np.random.rand sample shifted down by 0.5; the NumPy
    # result is converted to the declared C int return type.
    shifted = np.random.rand(1) - .5
    return np.sign(shifted)

def sim2(int n):
    """Simulate a random walk with ``n`` points and return it as an array.

    Naive Cython port: the loop counter and the step size are C variables,
    but the array is an untyped NumPy object.  Every point equals the
    previous point plus ``1/n`` times the value returned by ``step2()``.
    """
    path = np.zeros(n)
    cdef double increment = 1./n
    cdef int idx
    for idx in xrange(1, n):
        path[idx] = path[idx - 1] + increment * step2()
    return path



In [17]:

    
timeit sim2(m)









    



10 loops, best of 3: 69.4 ms per loop

Good Cython from the book:



In [26]:

    
%%cython
import numpy as np
cimport numpy as np

DTYPE = np.double
ctypedef np.double_t DTYPE_t

from libc.stdlib cimport rand, RAND_MAX
from libc.math cimport round

cdef double step3():
    # Scale the C library rand() by RAND_MAX, round it to 0 or 1, and map
    # that to -1 or +1.
    cdef double unit = float(rand()) / RAND_MAX
    return 2 * round(unit) - 1

def sim3(int n):
    """Simulate a random walk with ``n`` points and return it as an array.

    Typed Cython + NumPy version of ``sim1``/``sim2``: the array is declared
    as a one-dimensional ``DTYPE_t`` buffer and the step comes from the
    C-level ``step3()``, which returns -1 or +1.

    Parameters
    ----------
    n : int
        Number of points in the walk.

    Returns
    -------
    numpy.ndarray
        Array of length ``n`` with ``x[0] == 0`` and
        ``x[i+1] = x[i] + dx * step3()``, where ``dx = 1/n``.
    """
    cdef int i
    cdef double dx = 1./n
    cdef np.ndarray[DTYPE_t, ndim=1] x = np.zeros(n, dtype=DTYPE)
    for i in xrange(n-1):
        # The step is scaled by dx (multiplication), exactly as in sim1 and
        # sim2; subtracting step3() would move the walker by about +/-1 per
        # point instead of +/-dx.
        x[i+1] = x[i] + dx * step3()
    return x



In [28]:

    
timeit sim3(m)









    



10000 loops, best of 3: 111 us per loop



In [40]:

    
%%cython
from libc.stdlib cimport rand, RAND_MAX
def bla():
    # Print one value from the C library rand() scaled by RAND_MAX.
    scaled = float(rand())/RAND_MAX
    print scaled



In [43]:

    
bla()









    



0.67495010033



In [ ]: