Efficient programming for parallel computing

Timing and profiling: timeit and line_profiler

  • Timing your code

In [1]:
%%file timing.py
"""
some simple things to time
"""
import time


def _list_comprehension(N):
    """Squares of 0..N-1, built with a list comprehension."""
    return [x*x for x in xrange(N)]


def _for_append(N):
    """Squares of 0..N-1, built with a for loop and list.append."""
    L = []
    for x in xrange(N):
        L.append(x*x)
    return L


def _for_setitem(N):
    """Squares of 0..N-1, built by indexing into a preallocated list."""
    L = [None]*N
    i = 0
    for x in xrange(N):
        L[i] = x*x
        i += 1
    return L


def timed(f):
    """Decorator that records the wall-clock time of each call to f.

    The elapsed time of the most recent call is stored in dec.__time__
    (keyed by f's name) and retrieved via the .timed() accessor.
    """
    def dec(*args, **kwds):
        start = time.time()
        res = f(*args, **kwds) 
        # store elapsed wall-clock time for this call, keyed by name
        dec.__time__[f.__name__] = time.time() - start
        return res
    def get_time():
        # only one entry is ever stored per decorated function, so the
        # first value is the last recorded time (Py2: values() is a list)
        return dec.__time__.values()[0]
    dec.__time__ = {}
    dec.timed = get_time
    return dec


def compare(f1, f2, N, M=1000):
    """Print the ratio of total run time of f1(N) to f2(N) over M calls.

    Both f1 and f2 must be wrapped with the `timed` decorator, since
    their per-call times are read back through .timed().
    """
    t1 = 0; t2 = 0
    for i in xrange(M):
        f1(N)
        t1 += f1.timed()
    for i in xrange(M):
        f2(N)
        t2 += f2.timed()
    print "ratio: %s" % (t1/t2)



if __name__ == '__main__':
    # time one call of each list-building strategy at a fixed size
    N = 10000
    
    print("size = %s" % N)
    start = time.time()
    _list_comprehension(N)
    end = time.time() - start
    print("%s: list comp" % end)
    
    start = time.time()
    _for_append(N)
    end = time.time() - start
    print("%s: for append" % end)
    
    start = time.time()
    _for_setitem(N)
    end = time.time() - start
    print("%s: for setitem" % end)


# EOF


Overwriting timing.py

In [2]:
!python2.7 timing.py


size = 10000
0.000709056854248: list comp
0.00122308731079: for append
0.00180315971375: for setitem

In [3]:
import timing

@timing.timed
def sum_squared(x):
    """Sum of squares of the items of x."""
    return sum(i*i for i in x)

# the decorator records the elapsed time, exposed via .timed()
print "result: %s" % sum_squared(xrange(50))
print "time: %s" % sum_squared.timed()


result: 40425
time: 1.09672546387e-05

In [4]:
# the same function, undecorated: use IPython's %timeit for the timing
def sum_squared(x):
    """Sum of squares of the items of x."""
    return sum(i*i for i in x)

%timeit sum_squared(xrange(50))


100000 loops, best of 3: 5.17 µs per loop

Profiling your code


In [5]:
%%file profiling.py
"""some simple things to profile
"""
    
GLOBAL = 1  # module-level value, used to compare global vs. local lookups


def _repeat(counter):
    """Using the GLOBAL value directly.
    """
    for count in xrange(counter):
        GLOBAL  # global name lookup on every iteration


def _repeat_local(counter):
    """Making GLOBAL a local variable.
    """
    local = GLOBAL  # hoist the global into a local once
    for count in xrange(counter):
        local  # local lookup only


def _repeat_2(counter):
    """Using the built-in `True` in a loop.
    """
    for count in xrange(counter):
        True  # in Python 2, `True` is a name looked up each iteration


def _repeat_local_2(counter):
    """Making `True` a local variable.
    """
    true = True  # bind the builtin to a local once
    for count in xrange(counter):
        true

        
def _test(counter):
    """Call all functions.
    """
    _repeat(counter)
    _repeat_local(counter)
    _repeat_2(counter)
    _repeat_local_2(counter)


def profile(code_string):
    """Check the run times.
    """
    import cProfile
    profiler = cProfile.Profile()
    profiler.run(code_string)  # run the statement under the profiler
    profiler.print_stats()   
    

    
if __name__ == '__main__':
    profile('_test(int(1e8))')  # 1e8 iterations per variant


# EOF


Overwriting profiling.py

In [19]:
!python profiling.py


         7 function calls in 13.129 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000   13.129   13.129 <string>:1(<module>)
        1    2.896    2.896    2.896    2.896 profiling.py:14(_repeat_local)
        1    3.873    3.873    3.873    3.873 profiling.py:22(_repeat_2)
        1    2.902    2.902    2.902    2.902 profiling.py:29(_repeat_local_2)
        1    0.000    0.000   13.129   13.129 profiling.py:37(_test)
        1    3.458    3.458    3.458    3.458 profiling.py:7(_repeat)
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}



In [6]:
%%file http_search.py
"""http pattern search
"""

import re

PATTERN = r"https?:\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?"  # NOTE(review): `&amp;` looks like an HTML-escape artifact -- confirm the intended regex

@profile
def scan_for_http(f):
    addresses = []
    for line in f:
        result = re.search(PATTERN, line)
        if result:
            addresses.append(result.group(0))
    return addresses


# NOTE: @profile is injected by kernprof (line_profiler); running this file
# with plain python raises NameError.  re.search(PATTERN, ...) goes through
# the re module's pattern handling on every call -- the next version hoists
# re.compile out of the loop.
if __name__ == "__main__":
    # usage: kernprof -lv http_search.py <file-to-scan>
    import sys
    f = open(sys.argv[1], 'r')
    addresses = scan_for_http(f)
    for address in addresses:
        pass
       #print(address)


# EOF


Overwriting http_search.py

In [37]:
%%bash
kernprof -lv http_search.py sample.html


Wrote profile results to http_search.py.lprof
Timer unit: 1e-06 s

Total time: 0.009604 s
File: http_search.py
Function: scan_for_http at line 8

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     8                                           @profile
     9                                           def scan_for_http(f):
    10         1            2      2.0      0.0      addresses = []
    11      1350          778      0.6      8.1      for line in f:
    12      1349         8242      6.1     85.8          result = re.search(PATTERN, line)
    13      1349          543      0.4      5.7          if result:
    14        39           39      1.0      0.4              addresses.append(result.group(0))
    15         1            0      0.0      0.0      return addresses


In [7]:
%%file http_search.py
"""http pattern search
"""

import re

PATTERN = r"https?:\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?"  # NOTE(review): `&amp;` looks like an HTML-escape artifact -- confirm the intended regex

@profile
def scan_for_http(f):
    addresses = []
    pat = re.compile(PATTERN) # <-- NOTE
    for line in f:
        result = pat.search(line)
        if result:
            addresses.append(result.group(0))
    return addresses


# Compiling the pattern once moves the regex-setup cost out of the
# per-line loop; @profile is injected by kernprof (line_profiler).
if __name__ == "__main__":
    # usage: kernprof -lv http_search.py <file-to-scan>
    import sys
    f = open(sys.argv[1], 'r')
    addresses = scan_for_http(f)
    for address in addresses:
        pass
       #print(address)


# EOF


Overwriting http_search.py

In [39]:
%%bash
kernprof -lv http_search.py sample.html


Wrote profile results to http_search.py.lprof
Timer unit: 1e-06 s

Total time: 0.004025 s
File: http_search.py
Function: scan_for_http at line 8

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     8                                           @profile
     9                                           def scan_for_http(f):
    10         1            2      2.0      0.0      addresses = []
    11         1         1667   1667.0     41.4      pat = re.compile(PATTERN) # <-- NOTE
    12      1350          705      0.5     17.5      for line in f:
    13      1349         1150      0.9     28.6          result = pat.search(line)
    14      1349          466      0.3     11.6          if result:
    15        39           34      0.9      0.8              addresses.append(result.group(0))
    16         1            1      1.0      0.0      return addresses

Not covered: memory profiling. See guppy and pympler

Efficiency in language patterns

  • Global versus local
  • Staying native

In [15]:
import timing

@timing.timed
def use_ON(iterable):
    """Reverse by inserting each item at index 0 (insert(0, .) shifts
    the whole list, so the build is quadratic overall)."""
    result = []
    for item in iterable:
        result.insert(0, item)
    return result


@timing.timed
def use_O1(iterable):
    """Reverse by appending (amortized O(1)) then an in-place reverse."""
    result = []
    for item in iterable:
        result.append(item)
    result.reverse()
    return result


@timing.timed
def use_list(iterable):
    """Reverse via the native list() constructor plus in-place reverse."""
    result = list(iterable)
    result.reverse()
    return result


def compare_ON_O1(N):
    # ratio > 1 means the insert-at-front version is slower
    r1 = use_ON(range(N))
    r2 = use_O1(range(N))
    print use_ON.timed() / use_O1.timed()

    
def compare_O1_list(N):
    # ratio < 1 means the native list() version is faster
    r1 = use_list(range(N))
    r2 = use_O1(range(N))
    print use_list.timed() / use_O1.timed()

    
for i in [100,1000,10000]:
    print "for %s, ON:O1 =" % i,
    compare_ON_O1(i)
    print "for %s, O1:list =" % i,
    compare_O1_list(i)


for 100, ON:O1 = 2.01408450704
for 100, O1:list = 0.0597014925373
for 1000, ON:O1 = 4.36685288641
for 1000, O1:list = 0.0243902439024
for 10000, ON:O1 = 23.2021516571
for 10000, O1:list = 0.022230965965
  • Pointers versus copies

In [16]:
import timing

@timing.timed
def double_extend(N):
    """Double a list in place with extend (mutates, no new list object)."""
    x = range(N)
    x.extend(x)
    return x

@timing.timed
def double_concatenate(N):
    """Double a list by concatenation (allocates and copies a new list)."""
    x = range(N)
    return x+x

# ratio < 1 in the output: extend beats concatenate here
for i in [100,1000,10000]:
    print "N=%s" % i,
    timing.compare(double_extend, double_concatenate, N=i)


N=100 ratio: 0.911269430052
N=1000 ratio: 0.752598695878
N=10000 ratio: 0.66382711874
  • Choosing the right container

In [17]:
import timing

@timing.timed
def search_list(N):
    """Membership test in a list (linear scan; N is not in 0..N-1)."""
    x = list(xrange(N))
    return N in x

@timing.timed
def search_set(N):
    """Membership test in a set (hash lookup)."""
    x = set(xrange(N))
    return N in x


# NOTE: these timings include *building* each container, which dominates;
# the next cell measures the lookup cost on its own.
for j in [10,100,1000]:
    for i in [1000,10000,100000]:
        print "M=%s, N=%s" % (j, i),
        timing.compare(search_list, search_set, N=i, M=j)


M=10, N=1000 ratio: 0.924845269673
M=10, N=10000 ratio: 0.821234487793
M=10, N=100000 ratio: 0.614927891245
M=100, N=1000 ratio: 1.07002729444
M=100, N=10000 ratio: 0.959192717797
M=100, N=100000 ratio: 0.739521890806
M=1000, N=1000 ratio: 0.998064452741
M=1000, N=10000 ratio: 0.893941558453
M=1000, N=100000 ratio: 0.724139481948

In [17]:
# isolate the lookup cost: build each container once, then time `in` alone
N = 10000
x = set(xrange(N))
%timeit N in x
x = list(xrange(N))
%timeit N in x


The slowest run took 29.05 times longer than the fastest. This could mean that an intermediate result is being cached 
10000000 loops, best of 3: 73.9 ns per loop
10000 loops, best of 3: 144 µs per loop
  • Looping constructs

EXERCISE: Write test code that calculates the sum of all squares of the numbers from zero to one million. Use a for loop that directly adds with +=, a list comprehension, and also generator comprehension. Try it with range and xrange. Use different numbers, e.g. smaller and larger than one million. How do they compare?


In [24]:
%%file looping.py
"""test some looping constructs
"""

def generator(N):
    """Sum of squares via a generator expression (no intermediate list)."""
    return sum(i*i for i in xrange(N))

def list_comp(N):
    """Sum of squares via a list comprehension (materializes the list)."""
    return sum([i*i for i in xrange(N)])

def for_loop(N):
    """Sum of squares via an explicit += loop."""
    sum = 0  # NOTE(review): shadows the builtin `sum` inside this function
    for i in xrange(N):
        sum += i*i
    return sum


# NOTE(review): the %timeit magics below are IPython-only -- this file is
# meant to be %load-ed into a notebook cell (see the next cell), not run
# with plain python.
for N in [100,1000,10000,1000000]:
    print "N = %s" % N
    %timeit generator(N)
    %timeit list_comp(N)
    %timeit for_loop(N)


Overwriting looping.py

In [23]:
# %load looping.py
"""test some looping constructs
"""

def generator(N):
    """Sum of squares via a generator expression (no intermediate list)."""
    return sum(i*i for i in xrange(N))

def list_comp(N):
    """Sum of squares via a list comprehension (materializes the list)."""
    return sum([i*i for i in xrange(N)])

def for_loop(N):
    """Sum of squares via an explicit += loop."""
    sum = 0  # NOTE(review): shadows the builtin `sum` inside this function
    for i in xrange(N):
        sum += i*i
    return sum


# compare the three constructs across a range of sizes
for N in [100,1000,10000,1000000]:
    print "N = %s" % N
    %timeit generator(N)
    %timeit list_comp(N)
    %timeit for_loop(N)


N = 100
100000 loops, best of 3: 9.61 µs per loop
100000 loops, best of 3: 8.52 µs per loop
100000 loops, best of 3: 7.14 µs per loop
N = 1000
10000 loops, best of 3: 90.5 µs per loop
10000 loops, best of 3: 66.6 µs per loop
10000 loops, best of 3: 81.8 µs per loop
N = 10000
1000 loops, best of 3: 897 µs per loop
1000 loops, best of 3: 635 µs per loop
1000 loops, best of 3: 843 µs per loop
N = 1000000
10 loops, best of 3: 89.6 ms per loop
10 loops, best of 3: 71.8 ms per loop
10 loops, best of 3: 85 ms per loop

Vector programming: numpy

  • numpy basics

In [39]:
import numpy as np
# arithmetic operators on arrays apply elementwise
a = np.array([1,2,3,4])
b = np.array([5,6,7,8])

print a+b
print a*b
print a**2 - 2*b + 1


[ 6  8 10 12]
[ 5 12 21 32]
[-8 -7 -4  1]

In [44]:
print np.sin(a)  # elementwise ufunc
print np.max(a)  # reduction to a scalar


[ 0.84147098  0.90929743  0.14112001 -0.7568025 ]
4

In [49]:
print np.hstack((a,b))       # concatenate the two arrays
print np.add(a,b)            # ufunc form of a+b
print np.add.accumulate(a)   # running sum of a


[1 2 3 4 5 6 7 8]
[ 6  8 10 12]
[ 1  3  6 10]

In [54]:
# fill a preallocated array element-by-element with a Python loop
c = np.arange(0,8,2)
print c
d = np.empty(c.shape, dtype=int)
for i,j in enumerate(c):
    d[i] = j**2
    print d[:i+1]


[0 2 4 6]
[0]
[0 4]
[ 0  4 16]
[ 0  4 16 36]

In [62]:
# stack the two 1-d arrays into a 2x4 array
e = np.vstack((a,b))
print e


[[1 2 3 4]
 [5 6 7 8]]
  • Pointers versus copies

In [64]:
# reshaping: assigning to .shape changes e in place;
# reshape returns a new array object over the same data (a view, when possible)
print e.shape
e.shape = (-1,8)
print e
c = e.reshape((4,2))
print c


(2, 4)
[[1 2 3 4 5 6 7 8]]
[[1 2]
 [3 4]
 [5 6]
 [7 8]]

In [66]:
d = c.T           # transpose: a view on the same data as c
print d
b = d.flatten()   # flatten: a copy, independent of d (see next cell)
print b


[[-1  3  5  7]
 [ 2  4  6  8]]
[-1  3  5  7  2  4  6  8]

In [67]:
# c and d share data, so writing through c shows up in d;
# b was a copy, so b[-1] = 10 does not affect d
c[0,0] = -1
b[-1] = 10
print d


[[-1  3  5  7]
 [ 2  4  6  8]]
  • slice versus where

In [71]:
# slice assignment writes through the views into the shared data
d[0,:2] = [11,13]
c[2:,0] = [15,17]
print d


[[11 13 15 17]
 [ 2  4  6  8]]

In [72]:
b = d[0,::2]      # strided slice: still a view on d's data
print b
np.add(b,-10,b)   # in-place add, using b as the output argument
print b
print d           # the change is visible through d as well


[11 15]
[1 5]
[[ 1 13  5 17]
 [ 2  4  6  8]]

In [77]:
# transpose and basic slicing create views (no data copy),
# so they are cheap even on a 1e8-element array
x = np.arange(1e8)
%timeit x.T
%timeit y = x[1::2]
%timeit np.add(c,d.T)


The slowest run took 25.41 times longer than the fastest. This could mean that an intermediate result is being cached 
10000000 loops, best of 3: 159 ns per loop
The slowest run took 15.04 times longer than the fastest. This could mean that an intermediate result is being cached 
1000000 loops, best of 3: 270 ns per loop
The slowest run took 18.78 times longer than the fastest. This could mean that an intermediate result is being cached 
1000000 loops, best of 3: 698 ns per loop

In [113]:
x = np.linspace(0,16,8, dtype=int)
print x
print np.where(x >= 10)      # indices where the condition holds
print x[np.where(x >= 10)]   # index with those positions
print x[x >= 10]             # boolean-mask indexing: same result


[ 0  2  4  6  9 11 13 16]
(array([5, 6, 7]),)
[11 13 16]
[11 13 16]

In [114]:
# conditional update via a boolean mask: decrement the odd entries
x[x % 2 != 0] = x[x % 2 != 0] - 1
print x


[ 0  2  4  6  8 10 12 16]
  • Vector programming versus looping

In [115]:
def _sinc(x):
    """Scalar sinc: sin(x)/x, with the x == 0 limit handled explicitly."""
    if x == 0.0:
        return 1.0
    return np.sin(x)/x

sinc = np.vectorize(_sinc)  # could use as a decorator

print sinc(d)


[[ 0.84147098  0.03232054 -0.19178485 -0.05655279]
 [ 0.45464871 -0.18920062 -0.04656925  0.12366978]]

In [116]:
# np.vectorize is a convenience, not a speedup: it still loops in Python.
# NOTE(review): np.sinc computes the *normalized* sinc, sin(pi*x)/(pi*x),
# so it is faster but not numerically identical to _sinc above.
%timeit map(_sinc, x)
%timeit sinc(x)
%timeit np.sinc(x)


10000 loops, best of 3: 34.6 µs per loop
10000 loops, best of 3: 43.1 µs per loop
100000 loops, best of 3: 10.3 µs per loop

EXERCISE: Profile the functions in roll.py, strategy.py, and trials.py. Where are the hot spots? Can you make any significant improvements? Try converting to vectorized versions of the code in those three files where it seems appropriate. Can you make any gains? (Don't touch the code in optimize.py for now.) Note that some functions don't vectorize well, so make sure to retain the original versions of the code -- especially since some other techniques for speeding up code may prefer non-vector versions.

See: 'solution'

Programming efficiency: testing and error handling

  • Functional programming as a gateway to parallel

In [118]:
x = range(11)
y = range(-5,6)

def add(x,y):
    """Sum of two numbers."""
    return x+y

def squared(x):
    """Square of a number."""
    return x*x 

# comprehension pipeline vs. the equivalent map() composition
print [squared(i) for i in (add(j,k) for j,k in zip(x,y))]
print map(squared, map(add, x, y))

%timeit [squared(i) for i in (add(j,k) for j,k in zip(x,y))]
%timeit map(squared, map(add, x, y))


[25, 9, 1, 1, 9, 25, 49, 81, 121, 169, 225]
[25, 9, 1, 1, 9, 25, 49, 81, 121, 169, 225]
100000 loops, best of 3: 5.94 µs per loop
100000 loops, best of 3: 3.91 µs per loop

In [119]:
# multiprocessing.dummy gives a thread pool with the Pool interface;
# its map drops in for the builtin, but thread overhead swamps a tiny
# CPU-bound task (see timings below)
from multiprocessing.dummy import Pool
tmap = Pool().map
%timeit map(squared, range(10))
%timeit tmap(squared, range(10))


100000 loops, best of 3: 2.33 µs per loop
1000 loops, best of 3: 855 µs per loop

In [120]:
def sleepy_squared(x):
    """Square of x after a 0.1 s sleep (simulates blocking work)."""
    import time
    time.sleep(.1)
    return x*x

# for blocking work the thread pool wins: the sleeps overlap
%timeit map(sleepy_squared, range(10))
%timeit tmap(sleepy_squared, range(10))


1 loops, best of 3: 1.01 s per loop
1 loops, best of 3: 202 ms per loop
  • Not covered: nose and saving your validation code

In [121]:
%%file test_squared_map.py
"""sanity check for our parallel maps
"""
from multiprocessing.dummy import Pool
tmap = Pool().map  # thread-pool map with the same interface as builtin map

x = range(11)

def squared(x):
    return x*x  # simple deterministic workload

def test_squared():
    assert map(squared, x) == tmap(squared, x)  # threaded == serial result


# EOF


Writing test_squared_map.py

In [122]:
!nosetests test_squared_map.py


.
----------------------------------------------------------------------
Ran 1 test in 0.001s

OK
  • Errors and error handling

In [127]:
# deliberately broken: there is no module named `sleep`
# (sleep is a function in the time module), so map() raises ImportError
def bad(x):
    import sleep
    sleep(x)
    return x

print map(bad, range(2))


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-127-6c3719217f06> in <module>()
      4     return x
      5 
----> 6 print map(bad, range(2))

<ipython-input-127-6c3719217f06> in bad(x)
      1 def bad(x):
----> 2     import sleep
      3     sleep(x)
      4     return x
      5 

ImportError: No module named sleep

In [128]:
def less_bad(x):
    """Like bad(), but converts the ImportError into a None result."""
    try:
        import sleep
    except ImportError:
        return None
    sleep(x)
    return x

map(less_bad, range(2))


Out[128]:
[None, None]
  • Sanity checking with pyflakes and pep8

In [126]:
%%bash
pyflakes-2.7 test_squared_map.py
pep8-2.7 test_squared_map.py


test_squared_map.py:8:1: E302 expected 2 blank lines, found 1
test_squared_map.py:11:1: E302 expected 2 blank lines, found 1
test_squared_map.py:15:6: W292 no newline at end of file
  • Serialization and code encapsulation

In [135]:
%%file state.py
"""some good state utilities
"""

def check_pickle(x):
    "checks the pickle across a subprocess"
    import pickle
    import subprocess
    fail = True
    try:
        _x = pickle.dumps(x)
        fail = False
    finally:
        if fail:
            print "DUMP FAILED"
    # NOTE(review): if dumps() raised, _x is unbound here and the next
    # line raises NameError right after printing DUMP FAILED -- consider
    # returning early in the failure branch.
    # split(None,2) keeps the whole "-c" program as a single argv entry.
    msg = "python -c import pickle; print pickle.loads(%s)" % repr(_x)
    print "SUCCESS" if not subprocess.call(msg.split(None,2)) else "LOAD FAILED"


# EOF


Writing state.py

In [136]:
import pickle
# a list of ints pickles to a plain-text (protocol 0) string
print pickle.dumps([1,2,3])


(lp0
I1
aI2
aI3
a.

In [137]:
print pickle.dumps(squared)


c__main__
squared
p0
.

In [138]:
import state
# a plain list round-trips through a fresh interpreter
state.check_pickle([1,2,3])


SUCCESS

In [139]:
state.check_pickle(squared)


LOAD FAILED

In [140]:
%%file sleepy.py
'''test file for pickling 
'''
def sleepy_squared(x):
    # module-level function in an importable file: pickles by reference
    import time
    time.sleep(.1)
    return x*x


Writing sleepy.py

In [141]:
import sleepy
# defined in an importable module, so the subprocess can unpickle it
state.check_pickle(sleepy.sleepy_squared)


SUCCESS

Let's look at going parallel with multiprocessing...