In [ ]:
from IPython.display import YouTubeVideo
YouTubeVideo('PpBK4zIaFLE')
In [ ]:
!ls ~/repos/open-source-science-workshop/sessions/2014-06-16/yeast-genome
In [ ]:
from glob import glob
from toolz.curried import map, drop, concat, take, pipe, sorted
In [ ]:
def genome(file_pattern):
    """
    Build one lazy stream of uppercase bases from the FASTA files
    matching file_pattern.

    Files are read in sorted filename order; the first (header) line of
    each file is skipped, and the remaining lines are uppercased,
    stripped, and chained into a single character sequence.
    """
    # NOTE(review): file handles opened here are never closed explicitly;
    # they are released only when the lazy stream is garbage-collected.
    filenames = sorted(glob(file_pattern))  # deterministic file order
    handles = map(open, filenames)          # lazily open each file
    bodies = map(drop(1), handles)          # skip each file's header line
    lines = concat(bodies)                  # one continuous stream of lines
    uppercased = map(str.upper, lines)      # normalize case
    stripped = map(str.strip, uppercased)   # drop trailing newlines/space
    return concat(stripped)                 # one long character sequence
In [ ]:
path_prefix = '/home/hen/repos/open-source-science-workshop/sessions/2014-06-16/'
path_glob = path_prefix + 'yeast-genome/chr*.fa'
list(take(10, genome(path_glob)))
In [ ]:
glob(path_glob)
In [ ]:
files = pipe(path_glob, glob, sorted, list)
In [ ]:
pipe(files[0], open, take(5), ''.join, print)
In [ ]:
%%time
list(take(10, genome(path_glob)))
In [ ]:
%%time
list(take(10000000, genome(path_glob)))
In [ ]:
from toolz.curried import (
sliding_window,
frequencies,
merge_with,
merge,
curry,
)
def get_transition(item):
    """
    Reshape one (sequence, count) pair into a nested transition dict.

    The first n-1 elements of the sequence form the outer key, and the
    final element maps to the number of times it occurred after that
    history:

    >>> get_transition((('a', 'b'), 3))
    {('a',): {'b': 3}}
    """
    sequence, count = item
    *history, final = sequence
    return {tuple(history): {final: count}}
@curry
def markov(n, seq):
    """
    Perform a markov-chain analysis on seq.

    Each length-n history maps to a dict counting which element follows
    it, e.g. markov(1, 'aab') -> {('a',): {'a': 1, 'b': 1}}.
    """
    windows = sliding_window(n + 1, seq)               # every (n+1)-gram
    counts = frequencies(windows)                      # occurrence count per gram
    transitions = map(get_transition, counts.items())  # reshape each pair
    return merge_with(merge, transitions)              # combine per-history dicts
In [ ]:
%%time
pipe(path_glob, genome, markov(2))
Note that `markov` knows nothing about `genome`: the two compose only
because they share an interface — the basic Python data structures
(iterables and dicts).
In [ ]:
file_path = path_prefix + 'tale-of-two-cities.txt'
In [ ]:
!head $file_path -n 25
In [ ]:
def stem(word):
    """Stem word to primitive form.

    Lowercase the word and trim common punctuation from either end;
    interior punctuation (e.g. apostrophes in contractions) is kept.

    >>> stem("Hello!")
    'hello'
    """
    trailing = ",.!)-*_?:;$'-\""
    leading = "-*'\"(_$'"
    return word.lower().rstrip(trailing).lstrip(leading)
def lines_to_words(lines):
    """
    Convert a stream of lines to a stream of stemmed words.
    """
    word_groups = map(str.split, lines)  # whitespace-split each line
    words = concat(word_groups)          # flatten into one word stream
    return map(stem, words)              # normalize each word
In [ ]:
%%time
pipe(file_path, open, drop(25), lines_to_words, markov(1))
In [ ]:
from cytoolz.curried import (
map,
sliding_window,
frequencies,
merge_with,
merge,
pipe,
curry,
drop,
take,
sorted,
concat,
)
@curry
def markov(n, seq):
    """
    Perform a markov-chain analysis on seq.

    Each length-n history maps to a dict counting which element follows
    it, e.g. markov(1, 'aab') -> {('a',): {'a': 1, 'b': 1}}.
    """
    windows = sliding_window(n + 1, seq)               # every (n+1)-gram
    counts = frequencies(windows)                      # occurrence count per gram
    transitions = map(get_transition, counts.items())  # reshape each pair
    return merge_with(merge, transitions)              # combine per-history dicts
def lines_to_words(lines):
    """
    Convert a stream of lines to a stream of stemmed words.
    """
    word_groups = map(str.split, lines)  # whitespace-split each line
    words = concat(word_groups)          # flatten into one word stream
    return map(stem, words)              # normalize each word
def genome(file_pattern):
    """
    Build one lazy stream of uppercase bases from the FASTA files
    matching file_pattern.

    Files are read in sorted filename order; the first (header) line of
    each file is skipped, and the remaining lines are uppercased,
    stripped, and chained into a single character sequence.
    """
    # NOTE(review): file handles opened here are never closed explicitly;
    # they are released only when the lazy stream is garbage-collected.
    filenames = sorted(glob(file_pattern))  # deterministic file order
    handles = map(open, filenames)          # lazily open each file
    bodies = map(drop(1), handles)          # skip each file's header line
    lines = concat(bodies)                  # one continuous stream of lines
    uppercased = map(str.upper, lines)      # normalize case
    stripped = map(str.strip, uppercased)   # drop trailing newlines/space
    return concat(stripped)                 # one long character sequence
In [ ]:
%%time
pipe(file_path, open, drop(25), lines_to_words, markov(1))
In [ ]:
%%time
pipe(path_glob, genome, markov(2))