(after the command: ipcluster start -n 4)


In [2]:
from IPython.parallel import  Client

In [2]:
rc = Client()

In [3]:
rc.ids


Out[3]:
[0, 1, 2, 3]


In [4]:
px import os

In [5]:
px print os.getpid()


[stdout:0] 2633
[stdout:1] 2634
[stdout:2] 2635
[stdout:3] 2636

In [6]:
pxconfig --targets 1

In [7]:
px print os.getpid()


2634

In [8]:
%%px --targets 2
print os.getpid()


2635

In [9]:
pxconfig --targets all

In [10]:
px print os.getpid()


[stdout:0] 2633
[stdout:1] 2634
[stdout:2] 2635
[stdout:3] 2636


In [11]:
%%px --noblock
import time
time.sleep(1)
os.getpid()


Out[11]:
<AsyncResult: execute>

In [12]:
%pxresult


Out[0:21]: 2633
Out[1:24]: 2634
Out[2:22]: 2635
Out[3:18]: 2636

Parallel map


In [13]:
v = rc[:] # get a view to the engines

In [14]:
with v.sync_imports():
    import time


importing time on engine(s)

In [15]:
def f(x):
    """Square ``x`` after a one-second pause.

    The pause stands in for expensive work, so the serial and the parallel
    map timings differ visibly.
    """
    time.sleep(1)
    squared = x ** 2
    return squared

In [16]:
v.map_sync(f, range(10))


Out[16]:
[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [17]:
timeit -n 1 -r 1 v.map_sync(f, range(10))


1 loops, best of 1: 3.01 s per loop

In [18]:
timeit -n 1 -r 1 map(f, range(10))


1 loops, best of 1: 10 s per loop

Asynchronous map:


In [19]:
r = v.map(f, range(10))

In [21]:
r.ready(), r.elapsed


Out[21]:
(True, 3.484876)

In [59]:
r.get()


Out[59]:
[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [60]:
r.elapsed, r.serial_time


Out[60]:
(4.636209, 10.011764)

Monte Carlo Simulation


In [11]:
def sample(n):
    """Monte Carlo estimate of pi from ``n`` random points in the unit square.

    Counts the points with ``x**2 + y**2 <= 1`` and returns four times
    their fraction of ``n``.
    """
    xs = rand(n)
    ys = rand(n)
    hits = (xs**2 + ys**2 <= 1).sum()
    return 4*hits/float(n)

In [52]:
n = 1000000

In [53]:
timeit -r 10 -n 1 sample(n)


1 loops, best of 10: 31.7 ms per loop

In [14]:
from IPython.parallel import Client
rc = Client()
v = rc[:]

In [15]:
with v.sync_imports():
    from numpy.random import rand
    import numpy


importing rand from numpy.random on engine(s)
importing numpy on engine(s)

In [19]:
timeit -r 10 -n 1 sum(v.map_sync(sample, [n]*4)) / 4 # len(v) = 4


1 loops, best of 10: 459 ms per loop

In [165]:
(rand(5) <= 0.5)


Out[165]:
array([ True, False,  True, False,  True], dtype=bool)

In [65]:
(rand(4)**2 + rand(4)**2 <= 1)


Out[65]:
array([ True, False,  True,  True], dtype=bool)


In [128]:
%load_ext cythonmagic

Pure Python (to compare to Cython, and Cython+NumPy)


In [182]:
import random

In [157]:
def mcpy1(n):
    """Pure-Python Monte Carlo estimate of pi from ``n`` random points.

    Each iteration draws two coordinates with ``random.random()`` and counts
    the point when ``x**2 + y**2 <= 1``; the result is ``4 * count / n``.
    """
    inside = 0
    for _ in xrange(n):
        x = random.random()
        y = random.random()
        if x**2 + y**2 <= 1:
            inside += 1
    return 4*inside/float(n)

In [158]:
n = 1000000
# r1 = [random.random() for k in range(n)]

In [160]:
timeit -r 10 -n 10 mcpy1(n)


10 loops, best of 10: 380 ms per loop

Cython


In [1]:
%load_ext cythonmagic

In [29]:
%%cython
from libc.stdlib cimport rand, RAND_MAX
def mcpy2(int n):
    """Monte Carlo estimate of pi using the C library's ``rand()``.

    Draws ``n`` coordinate pairs, each coordinate being ``rand()`` divided by
    ``RAND_MAX``, counts the pairs with ``x**2 + y**2 <= 1`` and returns
    ``4 * count / n`` as a float.
    """
    # C-typed hit counter and loop index.
    cdef int dfg = 0
    cdef int k = 0
    for k in xrange(n):
        # The result of the comparison is added to the counter for each pair.
        dfg += ((float(rand())/RAND_MAX)**2 + (float(rand())/RAND_MAX)**2 <= 1)
    return 4*dfg/float(n)

In [30]:
n = 1000000

In [31]:
timeit -r 10 -n 10 mcpy2(n)


10 loops, best of 10: 13.3 ms per loop

In [47]:
mcpy2(n)


Out[47]:
3.141796

Cython+NumPy (not much different from the pure NumPy version in the Monte Carlo section above, because the NumPy types aren't declared as C types. See a few examples below on how to do that.)


In [171]:
%%cython
import random
import numpy as np
def mcpy3(int n):
    """Monte Carlo estimate of pi from ``n`` points using NumPy arrays.

    Draws two arrays of ``n`` random coordinates, counts the points with
    ``x**2 + y**2 <= 1`` via ``np.sum`` and returns ``4 * count / n``.
    """
    xs = np.random.rand(n)
    ys = np.random.rand(n)
    hits = np.sum(xs**2 + ys**2 <= 1)
    return 4*hits/float(n)

In [172]:
n = 1000000

In [173]:
timeit -r 10 -n 1 mcpy3(n)


1 loops, best of 10: 31.6 ms per loop


Random numbers may be misleading; try another algorithm: sum the squares of all integers from 0 up to (but not including) 1 000 000.

Pure Python:


In [101]:
n = 1000000

In [126]:
def sumProc01(n=1000000):
    """Return the sum of ``k**2`` for every integer ``k`` in ``0 .. n-1``.

    Parameters
    ----------
    n : int, optional
        Exclusive upper bound of the summed range. Defaults to 1000000, the
        value that was previously hard-coded, so ``sumProc01()`` is unchanged.

    Returns
    -------
    int
        The sum of squares; 0 when ``n <= 0``.
    """
    # map + lambda is kept on purpose: it is the construct being benchmarked.
    return sum(map(lambda x: x**2, xrange(n)))

In [127]:
timeit -r 10 -n 10 sumProc01()


10 loops, best of 10: 251 ms per loop

In [31]:
print sumProc01()


14

Cython:


In [199]:
n = 1000000

In [180]:
%%cython
def sumProc02(n=1000000):
    """Return the sum of ``k**2`` for every integer ``k`` in ``0 .. n-1``.

    Same code as the pure-Python version, but compiled by Cython without any
    C type declarations.

    Parameters
    ----------
    n : optional
        Exclusive upper bound of the summed range. Defaults to 1000000, the
        value that was previously hard-coded, so ``sumProc02()`` is unchanged.

    Returns
    -------
    The sum of squares; 0 when ``n <= 0``.
    """
    # map + lambda is kept on purpose: it is the construct being benchmarked.
    return sum(map(lambda x: x**2, xrange(n)))

In [181]:
timeit -r 10 -n 10 sumProc02()


10 loops, best of 10: 90.7 ms per loop


In [106]:
def bla():
    """Sum the squares of 0 .. 999999 by first collecting them in a list."""
    squares = []
    for value in xrange(1000000):
        squares.append(value**2)
    total = sum(squares)
    return total

In [113]:
timeit -r 10 -n 10 bla()


10 loops, best of 10: 227 ms per loop

In [193]:
%%cython
def bla2():
    """Sum the squares of 0 .. 999999 by first collecting them in a list.

    Cython version with a C-typed loop index. The index is 64-bit because
    ``k**2`` is computed in the C type of ``k`` and reaches about 1e12,
    which does not fit in a 32-bit ``int``.
    """
    out = []
    cdef long long k = 0
    for k in xrange(1000000):
        out.append(k**2)
    return sum(out)

In [194]:
timeit -r 10 -n 10 bla2()


10 loops, best of 10: 26.8 ms per loop

Basic pure Python vs. NumPy vs. Cython comparison:


In [24]:
def bla3():
    """Sum the squares of 0 .. 999999 with a plain Python accumulator loop."""
    total = 0
    for value in xrange(1000000):
        total += value**2
    return total

In [25]:
timeit -r 10 -n 10 bla3()


10 loops, best of 10: 85.8 ms per loop

In [20]:
def bla5():
    """Sum the squares of 0 .. 999999 using a vectorised array expression.

    ``arange`` comes from the notebook namespace (presumably NumPy's
    ``arange`` via pylab -- confirm).
    """
    # Request 64-bit integers explicitly: where the default integer dtype is
    # 32-bit, both the squares (up to ~1e12) and the sum (~3.3e17) overflow.
    return (arange(1000000, dtype='int64')**2).sum()

In [21]:
timeit -r 10 -n 10 bla5()


10 loops, best of 10: 4.34 ms per loop

In [66]:
%%cython
def bla4():
    """Sum the squares of 0 .. 999999 with C-typed accumulator and index.

    Both variables are 64-bit: ``k**2`` reaches about 1e12 and the total is
    333332833333500000, neither of which fits in a 32-bit ``int``.
    """
    cdef long long out = 0
    cdef long long k = 0
    for k in xrange(1000000):
        out += k**2
    return out

In [67]:
timeit -r 10 -n 10 bla4()


10 loops, best of 10: 607 us per loop

Does (x)range add any overhead? Compare with the above bla4():


In [50]:
%%cython
def blanext():
    """Sum the squares of 0 .. 999999 with an explicit ``while`` loop.

    Counterpart of the ``xrange``-based version, used to check whether the
    range loop adds overhead. Both variables are 64-bit: ``k**2`` reaches
    about 1e12 and the total is 333332833333500000, neither of which fits
    in a 32-bit ``int``.
    """
    cdef long long out = 0
    cdef long long k = 0
    while k < 1000000:
        out += k**2
        k += 1
    return out

In [54]:
timeit -r 10 -n 10 blanext()


10 loops, best of 10: 615 us per loop

Sieve


In [142]:
def primes1(n):
    """Return the list of primes below ``n`` using a sieve of Eratosthenes.

    A boolean table indexed by number starts with 0 and 1 marked non-prime;
    for every number still marked prime, its multiples from its square
    upwards are cleared.
    """
    is_prime = [False, False] + [True] * (n-2)
    candidate = 2
    while candidate < n:
        if is_prime[candidate]:
            # Smaller multiples were already cleared by smaller primes.
            multiple = candidate * candidate
            while multiple < n:
                is_prime[multiple] = False
                multiple += candidate
        candidate += 1
    return [p for p in xrange(2,n) if is_prime[p]]

In [144]:
primes1(20)


Out[144]:
[2, 3, 5, 7, 11, 13, 17, 19]

In [145]:
m = 10000

In [148]:
timeit -n 100 -r 3 primes1(m)


100 loops, best of 3: 3.28 ms per loop

Naive:


In [149]:
%load_ext cythonmagic

In [150]:
%%cython
def primes2(n):
    """Return the list of primes below ``n`` using a sieve of Eratosthenes.

    Untyped Cython version: the body is plain Python compiled as-is, with no
    C type declarations.
    """
    sieve = [False, False] + [True] * (n-2)
    number = 2
    while number < n:
        if sieve[number]:
            # Clear every multiple starting from the square of the prime.
            composite = number * number
            while composite < n:
                sieve[composite] = False
                composite += number
        number += 1
    return [q for q in xrange(2,n) if sieve[q]]

In [151]:
timeit -n 100 -r 3 primes2(m)


100 loops, best of 3: 1.42 ms per loop

With C types:


In [152]:
%%cython
def primes3(int n):
    """Return the list of primes below ``n`` using a sieve of Eratosthenes.

    Cython version with C-typed loop counters. The counters are 64-bit
    because ``i * i`` is evaluated in the C type of ``i``: with a 32-bit
    ``int`` it overflows once ``i`` exceeds 46340, producing a negative
    index into ``primes``.
    """
    primes = [False, False] + [True] * (n-2)
    cdef long long i = 2
    cdef long long k = 0
    while i < n:
        if not primes[i]:
            i += 1
            continue
        # Start at the square: smaller multiples were cleared by smaller primes.
        k = i * i
        while k < n:
            primes[k] = False
            k += i
        i += 1
    return [i for i in xrange(2,n) if primes[i]]

In [153]:
timeit -n 100 -r 3 primes3(m)


100 loops, best of 3: 264 us per loop

Using NumPy and Cython


In [1]:
%load_ext cythonmagic

In [7]:
def step1():
    """Return the sign of one uniform random draw shifted by -0.5.

    The result is a one-element array, because ``rand(1)`` returns one.
    """
    draw = rand(1)
    return sign(draw - .5)

def sim1(n):
    """Simulate an ``n``-point random walk that starts at 0.

    Each point equals the previous one plus ``step1()`` scaled by the step
    size ``1/n``. Returns the array of positions.
    """
    path = zeros(n)
    dx = 1./n
    for idx in xrange(1, n):
        path[idx] = path[idx-1] + dx * step1()
    return path

In [9]:
plot(sim1(10000))


Out[9]:
[<matplotlib.lines.Line2D at 0x10830bf90>]

In [27]:
m = 10000

In [10]:
timeit sim1(m)


10 loops, best of 3: 112 ms per loop

Naive Cython + NumPy:


In [16]:
%%cython
import numpy as np
# Draws one random sign step through NumPy. ``np.random.rand(1)`` yields a
# one-element array, so the ``np.sign(...)`` result is a Python object that
# Cython has to coerce to the declared C ``int`` return type on every call.
# NOTE(review): presumably this per-call NumPy round trip is why the cell is
# labelled "naive" -- confirm.
cdef int step2():
    return np.sign(np.random.rand(1) - .5)

def sim2(int n):
    """Simulate an ``n``-point random walk that starts at 0.

    Each point equals the previous one plus ``step2()`` scaled by the step
    size ``1/n``. The loop index and step size are C-typed; the array itself
    is an untyped NumPy object.
    """
    path = np.zeros(n)
    cdef double dx = 1./n
    cdef int idx
    for idx in xrange(1, n):
        path[idx] = path[idx-1] + dx * step2()
    return path

In [17]:
timeit sim2(m)


10 loops, best of 3: 69.4 ms per loop

Good Cython from the book:


In [26]:
%%cython
import numpy as np
cimport numpy as np

DTYPE = np.double
ctypedef np.double_t DTYPE_t

from libc.stdlib cimport rand, RAND_MAX
from libc.math cimport round

# One random step: rounds a uniform draw (C ``rand()`` divided by
# ``RAND_MAX``) and maps the rounded value v to 2*v - 1.
cdef double step3():
    cdef double u = float(rand()) / RAND_MAX
    return 2 * round(u) - 1

def sim3(int n):
    """Simulate an ``n``-point random walk that starts at 0.

    Each point equals the previous one plus ``step3()`` scaled by the step
    size ``1/n``. The position array is declared as a typed one-dimensional
    NumPy buffer of ``DTYPE`` and the loop index is a C ``int``.

    Returns the array of positions.
    """
    cdef int i
    cdef double dx = 1./n
    cdef np.ndarray[DTYPE_t, ndim=1] x = np.zeros(n, dtype=DTYPE)
    for i in xrange(n-1):
        # Scale the step by dx (the original subtracted it), so each
        # increment is dx * step3() as in the other sim* versions.
        x[i+1] = x[i] + dx * step3()
    return x

In [28]:
timeit sim3(m)


10000 loops, best of 3: 111 us per loop


In [40]:
%%cython
from libc.stdlib cimport rand, RAND_MAX
def bla():
    """Print one value of C ``rand()`` divided by ``RAND_MAX``."""
    scaled = float(rand())/RAND_MAX
    print scaled

In [43]:
bla()


0.67495010033


In [ ]: