In [45]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

Working with large data sets

Lazy evaluation, pure functions and higher order functions

Lazy and eager evaluation

A list comprehension is eager.


In [20]:
[x*x for x in range(3)]


Out[20]:
[0, 1, 4]

A generator expression is lazy.


In [21]:
(x*x for x in range(3))


Out[21]:
<generator object <genexpr> at 0x1138f11a8>

A generator is an iterator, so you can step through it one value at a time with `next`.


In [22]:
g = (x*x for x in range(3))

In [23]:
next(g)


Out[23]:
0

In [24]:
next(g)


Out[24]:
1

In [25]:
next(g)


Out[25]:
4

In [26]:
next(g)


---------------------------------------------------------------------------
StopIteration                             Traceback (most recent call last)
<ipython-input-26-5f315c5de15b> in <module>()
----> 1 next(g)

StopIteration: 

A generator is single-use: once it is exhausted, iterating over it again produces nothing.


In [ ]:
for i in g:
    print(i, end=", ")

In [28]:
g = (x*x for x in range(3))
for i in g:
    print(i, end=", ")

The list constructor forces evaluation of the generator.


In [30]:
list(x*x for x in range(3))


Out[30]:
[0, 1, 4]

An eager function.


In [3]:
def eager_updown(n):
    """Return the list 0, 1, ..., n-1, n, n-1, ..., 1, 0.

    The whole sequence is materialised before the function returns.
    """
    ascending = list(range(n))
    # The descent starts at n itself, so n appears exactly once at the peak.
    descending = list(range(n, -1, -1))
    return ascending + descending

In [4]:
eager_updown(3)


Out[4]:
[0, 1, 2, 3, 2, 1, 0]

A lazy generator.


In [5]:
def lazy_updown(n):
    """Yield 0, 1, ..., n-1, n, n-1, ..., 1, 0 one value at a time.

    Calling the function only creates the generator; values are produced
    as the caller iterates.
    """
    # Ascending leg stops before n; the descending leg starts at n and ends at 0.
    legs = (range(n), range(n, -1, -1))
    for leg in legs:
        for value in leg:
            yield value

In [6]:
lazy_updown(3)


Out[6]:
<generator object lazy_updown at 0x103a52af0>

In [7]:
list(lazy_updown(3))


Out[7]:
[0, 1, 2, 3, 2, 1, 0]

Pure and impure functions

A pure function is like a mathematical function. Given the same inputs, it always returns the same output, and has no side effects.


In [32]:
def pure(alist):
    """Return a new list holding the square of each item in alist.

    alist itself is only read, never modified.
    """
    squares = []
    for value in alist:
        squares.append(value * value)
    return squares

An impure function has side effects (for example, it modifies its arguments) or depends on state other than its inputs.


In [39]:
def impure(alist):
    """Square every item of alist in place and return the same list object.

    The caller's list is overwritten, which is the side effect this
    function exists to demonstrate.
    """
    for position, value in enumerate(alist):
        # Assigning by index replaces the item inside the caller's list.
        alist[position] = value * value
    return alist

In [40]:
xs = [1,2,3]

In [41]:
ys = pure(xs)
print(xs, ys)


[1, 2, 3] [1, 4, 9]

In [42]:
ys = impure(xs)
print(xs, ys)


[1, 4, 9] [1, 4, 9]

Quiz

State whether each of the following functions is pure or impure.


In [57]:
def f1(n):
    """Return n // 2 when n is even, otherwise 3 * n + 1 (one Collatz step)."""
    is_even = (n % 2 == 0)
    if is_even:
        return n // 2
    return n * 3 + 1

In [58]:
def f2(n):
    """Return np.random.random(n): random floats in [0, 1) of size n.

    The values are drawn from numpy's global random state.
    """
    samples = np.random.random(size=n)
    return samples

In [59]:
def f3(n):
    """Return 23 regardless of the argument passed in."""
    # Rebinds only the local name n; the object the caller passed is not touched.
    n = 23
    return n

In [60]:
def f4(a, n=[]):
    """Append a to n and return n.

    The default value for n is a single list created once, when the def
    statement runs, so calls that omit n all append to that same list.
    """
    n.append(a)
    return n

Higher order functions


In [61]:
list(map(f1, range(10)))


Out[61]:
[0, 4, 1, 10, 2, 16, 3, 22, 4, 28]

In [63]:
list(filter(lambda x: x % 2 == 0, range(10)))


Out[63]:
[0, 2, 4, 6, 8]

In [62]:
from functools import reduce

In [66]:
reduce(lambda x, y: x + y, range(10), 0)


Out[66]:
45

In [68]:
reduce(lambda x, y: x + y, [[1,2], [3,4], [5,6]], [])


Out[68]:
[1, 2, 3, 4, 5, 6]

Using the operator module

The operator module provides all the Python operators as functions.


In [69]:
import operator as op

In [71]:
reduce(op.mul, range(1, 6), 1)


Out[71]:
120

In [72]:
list(map(op.itemgetter(1), [[1,2,3],[4,5,6],[7,8,9]]))


Out[72]:
[2, 5, 8]

Using itertools


In [73]:
import itertools as it

In [76]:
list(it.combinations(range(1,6), 3))


Out[76]:
[(1, 2, 3),
 (1, 2, 4),
 (1, 2, 5),
 (1, 3, 4),
 (1, 3, 5),
 (1, 4, 5),
 (2, 3, 4),
 (2, 3, 5),
 (2, 4, 5),
 (3, 4, 5)]

Generate all Boolean combinations


In [85]:
list(it.product([0,1], repeat=3))


Out[85]:
[(0, 0, 0),
 (0, 0, 1),
 (0, 1, 0),
 (0, 1, 1),
 (1, 0, 0),
 (1, 0, 1),
 (1, 1, 0),
 (1, 1, 1)]

In [78]:
list(it.starmap(op.add, zip(range(5), range(5))))


Out[78]:
[0, 2, 4, 6, 8]

In [79]:
list(it.takewhile(lambda x: x < 3, range(10)))


Out[79]:
[0, 1, 2]

In [100]:
# groupby only merges runs of adjacent items with equal keys, so the words
# are first ordered by the same key (their length) that they are grouped by.
data = 'the quick brown fox jumps over the lazy dog'.split()
data.sort(key=len)
for k, g in it.groupby(data, key=len):
    # g is a lazy group iterator; list() materialises it for printing.
    print(k, list(g))


3 ['the', 'fox', 'the', 'dog']
4 ['over', 'lazy']
5 ['quick', 'brown', 'jumps']

Using toolz


In [101]:
import toolz as tz

In [104]:
list(tz.partition(3, range(10)))


Out[104]:
[(0, 1, 2), (3, 4, 5), (6, 7, 8)]

In [106]:
list(tz.partition(3, range(10), pad=None))


Out[106]:
[(0, 1, 2), (3, 4, 5), (6, 7, 8), (9, None, None)]

In [151]:
# Build a random DNA string: n bases sampled with replacement from A, C, T, G,
# using numpy's global random state (no seed is set, so each run differs).
n = 30
dna = ''.join(np.random.choice(list('ACTG'), size=n))
dna


Out[151]:
'AAACTTCGGAGAGACGGGTTAACCGTACGC'

In [152]:
tz.frequencies(tz.sliding_window(2, dna))


Out[152]:
{('A', 'A'): 3,
 ('A', 'C'): 4,
 ('A', 'G'): 2,
 ('C', 'C'): 1,
 ('C', 'G'): 4,
 ('C', 'T'): 1,
 ('G', 'A'): 3,
 ('G', 'C'): 1,
 ('G', 'G'): 3,
 ('G', 'T'): 2,
 ('T', 'A'): 2,
 ('T', 'C'): 1,
 ('T', 'T'): 2}

Using pipes and the curried namespace


In [153]:
from toolz import curried as c

In [157]:
# pipe threads dna through the stages left to right: first pair up adjacent
# bases, then count how often each pair occurs.
# c.sliding_window(2) is the curried form: a one-argument function that is
# still waiting for the sequence.
tz.pipe(dna, c.sliding_window(2), c.frequencies)


Out[157]:
{('A', 'A'): 3,
 ('A', 'C'): 4,
 ('A', 'G'): 2,
 ('C', 'C'): 1,
 ('C', 'G'): 4,
 ('C', 'T'): 1,
 ('G', 'A'): 3,
 ('G', 'C'): 1,
 ('G', 'G'): 3,
 ('G', 'T'): 2,
 ('T', 'A'): 2,
 ('T', 'C'): 1,
 ('T', 'T'): 2}

In [164]:
# compose applies its functions right to left, so the sliding windows are
# built first and their frequencies are counted second.
composed = tz.compose(c.frequencies, c.sliding_window(2))

In [165]:
composed(dna)


Out[165]:
{('A', 'A'): 3,
 ('A', 'C'): 4,
 ('A', 'G'): 2,
 ('C', 'C'): 1,
 ('C', 'G'): 4,
 ('C', 'T'): 1,
 ('G', 'A'): 3,
 ('G', 'C'): 1,
 ('G', 'G'): 3,
 ('G', 'T'): 2,
 ('T', 'A'): 2,
 ('T', 'C'): 1,
 ('T', 'T'): 2}

Processing many DNA strings without reading them all into memory


In [184]:
m = 10000  # number of DNA strings to generate
n = 300    # bases per string
# Generator expression: each string is built only when the consumer asks for
# it, so the m strings are never all held in memory at once.
# p gives the sampling probability of A, C, T, G in that order.
dnas = (
    ''.join(np.random.choice(list('ACTG'), size=n, p=[.1, .2, .3, .4]))
    for _ in range(m)
)
dnas


Out[184]:
<generator object <genexpr> at 0x113a60f10>

In [185]:
# Map each DNA string to its dict of adjacent-base pair counts (lazily, one
# string at a time), then combine the dicts by summing the counts per key.
tz.merge_with(sum, tz.map(composed, dnas))


Out[185]:
{('A', 'A'): 29999,
 ('A', 'C'): 59527,
 ('A', 'G'): 119450,
 ('A', 'T'): 89875,
 ('C', 'A'): 59561,
 ('C', 'C'): 119710,
 ('C', 'G'): 239052,
 ('C', 'T'): 179214,
 ('G', 'A'): 119375,
 ('G', 'C'): 238655,
 ('G', 'G'): 477370,
 ('G', 'T'): 359405,
 ('T', 'A'): 89962,
 ('T', 'C'): 179648,
 ('T', 'G'): 358862,
 ('T', 'T'): 270335}

Working with out-of-core memory

Using memmap


In [ ]:

Using HDF5


In [ ]:

Using SQLite3


In [ ]:

Out-of-memory data conversions


In [ ]:

Probabilistic data structures

Bloom filters


In [ ]:

Small-scale distributed programming

Using dask


In [ ]:

dask arrays


In [ ]:

dask bags


In [ ]:

dask data frames


In [ ]:


In [ ]:


In [ ]:


In [ ]: