DNA Markov chain

This example also taken from Matthew Rocklin:


In [ ]:
from IPython.display import YouTubeVideo
YouTubeVideo('PpBK4zIaFLE')

In [ ]:
!ls ~/repos/open-source-science-workshop/sessions/2014-06-16/yeast-genome

In [ ]:
from glob import glob
from toolz.curried import map, drop, concat, take, pipe, sorted

In [ ]:
def genome(file_pattern):
    operations = (
        glob,            # Get matching filenames.
        sorted,    # Sort em.
        map(open),       # open each file.
        map(drop(1)),    # drop first line of each file.
        concat,          # concat all files together.
        map(str.upper),  # Uppercase each line.
        map(str.strip),  # Strip off whitespace from each line.
        concat,          # Concatenate all lines into one long sequence.
    )
    return pipe(file_pattern, *operations)

In [ ]:
path_prefix = '/home/hen/repos/open-source-science-workshop/sessions/2014-06-16/'
path_glob = path_prefix + 'yeast-genome/chr*.fa'

list(take(10, genome(path_glob)))

In [ ]:
glob(path_glob)

In [ ]:
files = pipe(path_glob, glob, sorted, list)

In [ ]:
pipe(files[0], open, take(5), ''.join, print)

Laziness

We only process what we need:


In [ ]:
%%time

list(take(10, genome(path_glob)))

In [ ]:
%%time

list(take(10000000, genome(path_glob)))

Markov analysis


In [ ]:
from toolz.curried import (
    sliding_window,
    frequencies,
    merge_with,
    merge,
    curry,
)

def get_transition(item):
    """
    Transform the number of times a given sequence of len n has occured
    into a dictionary with the first n-1 items as the key
    and the value another dictionary mapping the last item to the count.
    
    """
    seq, count = item
    *previous, last = seq
    return {tuple(previous): {last: count}}


@curry
def markov(n, seq):
    """
    Perform a markov-chain analysis on seq.
    Use last n elements to define a history.
    
    """
    operations = (
        sliding_window(n + 1), # Get each successive sequence.
        frequencies,           # Count occurences.
        dict.items,            # Get triple-count pairs.
        map(get_transition),   # Reshape each.
        merge_with(merge),     # Merge dicts from different pairs.
)
    return pipe(seq, *operations)

In [ ]:
%%time

pipe(path_glob, genome, markov(2))

Note that markov has nothing to do with genome. They work because of the shared interface, the basic Python data structures.


In [ ]:
file_path = path_prefix + 'tale-of-two-cities.txt'

In [ ]:
!head $file_path -n 25

In [ ]:
def stem(word):
    """ Stem word to primitive form 
    
    >>> stem("Hello!")
    'hello'
    """
    return word.lower().rstrip(",.!)-*_?:;$'-\"").lstrip("-*'\"(_$'")

def lines_to_words(lines):
    """
    Convert a stream of lines to a stream of stemmed words.
    """
    operations = (
        map(str.split),
        concat,
        map(stem),
)
    return pipe(lines, *operations)

In [ ]:
%%time

pipe(file_path, open, drop(25), lines_to_words, markov(1))

cytoolz

There is a library called cython that essentially lets us write C code in Python. toolz has been implement in cython as cytoolz. Let's try the above with cytoolz and see what kind of improvement we get.

This is about the end of the line in terms of performance if you're not doing numerics.


In [ ]:
from cytoolz.curried import (
    map,
    sliding_window,
    frequencies,
    merge_with,
    merge,
    pipe,
    curry,
    drop,
    take,
    sorted,
    concat,
)

@curry
def markov(n, seq):
    """
    Perform a markov-chain analysis on seq.
    Use last n elements to define a history.
    
    """
    operations = (
        sliding_window(n + 1), # Get each successive sequence.
        frequencies,           # Count occurences.
        dict.items,            # Get triple-count pairs.
        map(get_transition),   # Reshape each.
        merge_with(merge),     # Merge dicts from different pairs.
)
    return pipe(seq, *operations)


def lines_to_words(lines):
    """
    Convert a stream of lines to a stream of stemmed words.
    """
    operations = (
        map(str.split),
        concat,
        map(stem),
)
    return pipe(lines, *operations)


def genome(file_pattern):
    operations = (
        glob,            # Get matching filenames.
        sorted,    # Sort em.
        map(open),       # open each file.
        map(drop(1)),    # drop first line of each file.
        concat,          # concat all files together.
        map(str.upper),  # Uppercase each line.
        map(str.strip),  # Strip off whitespace from each line.
        concat,          # Concatenate all lines into one long sequence.
    )
    return pipe(file_pattern, *operations)

In [ ]:
%%time

pipe(file_path, open, drop(25), lines_to_words, markov(1))

In [ ]:
%%time

pipe(path_glob, genome, markov(2))