We compare three implementations of the same Algorithm: calculating $\pi$ by Monte Carlo method

  • Serial
  • multiprocessing
  • ipyparallel

Serial solution


In [1]:
from random import uniform
from time import time

In [2]:
def sample_circle(n):
    """
    throw n darts in [0, 1] * [0, 1] square, return the number
    of darts inside unit circle.
    
    Parameter
    ---------
        n: number of darts to throw.
    
    Return
    ------
        m: number of darts inside unit circle.
    """
    m = 0
    for i in range(int(n)):
        x, y = uniform(0, 1), uniform(0, 1)
        if x**2 + y**2 < 1:
            m += 1
    return m

In [3]:
def pi_serial(n):
    """
    Naive serial implementation of monte carlo pi using 
        sample_circle function.
        
    Parameter
    ---------
        n: number of darts to throw
    
    Return
    ------
        value of pi in the monte-carlo simulation
    """
    t1 = time()
    m = sample_circle(n)
    t2 = time()
    t_loop = t2 - t1
    
    pi_approx = 4. * m / n
    return pi_approx, t_loop

In [4]:
n = 100000
%time pi_serial(n)


CPU times: user 137 ms, sys: 3.94 ms, total: 141 ms
Wall time: 140 ms
Out[4]:
(3.14236, 0.14027786254882812)

Multiprocessing implementation


In [5]:
from random import uniform
from multiprocessing import Pool 
from time import time

In [6]:
def sample_circle(n):
    """
    throw n darts in [0, 1] * [0, 1] square, return the number
    of darts inside unit circle.
    
    Parameter
    ---------
        n: number of darts to throw.
    
    Return
    ------
        m: number of darts inside unit circle.
    """
    m = 0
    for i in range(int(n)):
        x, y = uniform(0, 1), uniform(0, 1)
        if x**2 + y**2 < 1:
            m += 1
    return m

In [7]:
def pi_mp(num_darts, num_proc=None, msg = False):
    """
    Calculate pi using multiprocessing
    
    Parameter
    ---------
    num_darts: total number of darts to throw to calculate pi
    num_proc: number of processors/ workers to assign, default
            value = os.cpu_count()
    
    Return
    ------
        pi_approx: approximated value of pi
        t_loop: time spent in monte carlo simulation in seconds.
            initializing and shutting down worker pool have been
            excluded in timing. 
    """
    # default number processes = num of CPUs
    if not num_proc:
        import os
        num_proc = os.cpu_count()    
    
    t1 = time()
    
    # average number of darts that each processor process
    
    avg_load = num_darts // num_proc
    extra_load = num_darts % avg_load
    
    # initialize workload for processors
    
    loads = [avg_load] * num_proc
    loads[num_proc - 1] += extra_load
    
    # start a pool of workers
    pool = Pool(processes=num_proc)

    t2 = time()
    
    # assign jobs for each worker
    result = pool.imap_unordered(sample_circle, loads)
    # combine results from all workers
    num_in_cirlce = sum(result)
    
    t3 = time()
    
    # shut down pool, remove pointer to pool object
    # allowing garbage collectors release memory
    pool.terminate()
    del pool
    
    t4 = time()
    
    t_setup = t2-t1
    t_loop = t3-t2
    t_shutdown = t4-t3
    pi_approx = 4 *  num_in_cirlce / num_darts
    
    if msg:
        print("set up {0} workers used {1:.3g}s".format(num_proc, t_setup))
        print("throwing {0} darts used {1:.3g}s".format(num_darts, t_loop))
        print("terminate {0} workers used {1:.3g}s".format(num_proc, t_shutdown))
    return pi_approx, t_loop

In [8]:
num_darts = 100000
pi_mp(num_darts)


Out[8]:
(3.14116, 0.06301093101501465)

ipyparallel implementation

First start ipython cluster in terminal

$ipcluster start -n 4

In [9]:
def sample_circle(n):
    """
    throw n darts in [0, 1] * [0, 1] square, return the number
    of darts inside unit circle.
    
    Parameter
    ---------
        n: number of darts to throw.
    
    Return
    ------
        m: number of darts inside unit circle.
    """
    m = 0
    for i in range(int(n)):
        x, y = uniform(0, 1), uniform(0, 1)
        if x**2 + y**2 < 1:
            m += 1
    return m

In [10]:
from ipyparallel import Client
from time import time

In [11]:
def pi_ipp(num_darts, msg=False):
    """
    Calculate pi using ipyparallel module
    
    Parameter
    ---------
    num_darts: total number of darts to throw to calculate pi
    num_proc: number of processors/ workers to assign, default
            value = os.cpu_count()
    
    Return
    ------
        pi_approx: approximated value of pi
        t_loop: time spent in monte carlo simulation in seconds.
            initializing ipyparallel client has been
            excluded in timing.     
    """
    t1 = time()
        
    num_proc = len(clients.ids)
    avg_load = num_darts // num_proc
    extra_load = num_darts % avg_load
    
    # initialize workload for processors
    loads = [avg_load] * num_proc
    loads[num_proc - 1] += extra_load
        
    t2 = time()
    
    result = dview.map_async(sample_circle, loads)   
    approx_pi = 4 * sum(result) / num_darts
    t3 = time() 
    
    t_loop = t3 - t2
    t_setup = t2 - t1
    
    if msg:
        print("set up {0} ipyparallel engines used {1:.3g}s".format(
                num_proc, t_setup))
        print("throwing {0} darts used {1:.3g}s".format(num_darts, t_loop))
    return approx_pi, t_loop

In [12]:
clients = Client()
dview = clients.direct_view()
with dview.sync_imports():
    from random import uniform
n = 100000
pi_ipp(n)


importing uniform from random on engine(s)
Out[12]:
(3.14228, 0.0707390308380127)

Performance test


In [13]:
n_benchmark = [10, 30, 100, 3e2, 1e3, 3e3, 1e4, 3e4, 1e5, 3e5, 1e6, 3e6, 1e7]

In [14]:
t_serial = [pi_serial(n)[1] for n in n_benchmark]

In [15]:
t_mp = [pi_mp(n)[1] for n in n_benchmark]

In [19]:
clients = Client()
dview = clients.direct_view()
with dview.sync_imports():
    from random import uniform
t_ipp = [pi_ipp(n)[1] for n in n_benchmark]


importing uniform from random on engine(s)

In [20]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [21]:
fs = 20
f1, ax1= plt.subplots(figsize = [8, 6])
ax1.plot(n_benchmark, t_ipp, label = 'IPcluster')
ax1.plot(n_benchmark, t_mp, label = 'Multiprocessing')
ax1.plot(n_benchmark, t_serial, label = 'serial')


ax1.set_yscale('log')
ax1.set_xscale('log')
plt.legend(loc = 'best')
ax1.set_xlabel('# of darts thrown', fontsize = fs)
ax1.set_ylabel('Execution time (s): solid', fontsize = fs)

ax2 = ax1.twinx()
n_bm_np = np.array(n_benchmark)
ax2.plot(n_benchmark, n_bm_np/np.array(t_ipp), '--')
ax2.plot(n_benchmark, n_bm_np/np.array(t_mp), '--')
ax2.plot(n_benchmark, n_bm_np/np.array(t_serial), '--')

ax2.set_yscale('log')
ax2.set_xscale('log')

ax2.set_ylabel('simulation rate (darts/s): dashed', fontsize = fs)
plt.show()



In [23]:
# f1.savefig('performance.png', dpi = 300)

Result

-- hardware spec: MacBook Pro 2.6 GHz Intel Core i5, 4 cores


In [ ]: