In [ ]:
# Imports
import itertools as it
import operator as op
import numpy as np
from collections import Counter
# Definitions
# TODO: write script to...
# Parse a DSK file, use verbose mode to create .pre and .packed,
# then create the graph and print out labels after,
# then traverse with simple paths
#
# Input file names. The commented-out alternatives are smaller test datasets;
# swap them in to run the checks against those instead.
#run dsk's parse_results to produce the .parsed kmer list
#dsk_filename = "k27_d3_test.parsed"
#dsk_filename = "k35_d3_test.parsed"
dsk_filename = "ch14_k56.dsk.parsed"
#run cosmo-pack with verbose set to on to produce the .pre and .packed files
#pre_filename = "k27_d3_test.pre"
#pre_filename = "k35_d3_test.pre"
pre_filename = "ch14.pre"
#packed_filename = "k27_d3_test.solid_kmers_binary.packed"
#packed_filename = "k35_d3_test.solid_kmers_binary.packed"
packed_filename = "chrom14_d5_k56.solid_kmers_binary.packed"
#run cosmo-assemble on the dbg file with verbose set to on.
#dbg_filename = "parsed_dbg"
#dbg_filename = "k35_d3_test.dbg_parse"
dbg_filename = "ch14.dbg.parsed"
# TODO: try again for larger k, linear graphs and cyclic strings
def complement(x):
    """Return the complement of DNA string x (A<->T, C<->G), preserving case.

    Characters outside "ACGTacgt" (e.g. the "$" dummy symbol) are left
    unchanged by str.translate.
    """
    # string.maketrans only exists on Python 2; str.maketrans is the
    # Python 3 equivalent, so fall back to it when the import fails.
    try:
        from string import maketrans
    except ImportError:
        maketrans = str.maketrans
    table = maketrans("ACGTacgt", "TGCAtgca")
    return x.translate(table)
# Small kmer helpers.  Converted from lambda assignments to defs (PEP 8 E731)
# so they get real names in tracebacks and can carry docstrings.
def flatten(list_of_lists):
    """Lazily concatenate an iterable of iterables into one flat iterator."""
    return it.chain.from_iterable(list_of_lists)

def reverse(x):
    """Return the sequence x reversed."""
    return x[::-1]

def twin(x):
    """Reverse complement of kmer x."""
    return reverse(complement(x))

def one_of(xs):
    """Return an arbitrary element of a (non-empty) iterable."""
    return next(iter(xs))

def edge(x):
    """Edge symbol of a kmer: its last character."""
    return x[-1]

def node(x):
    """Source node of a kmer: everything but the last character."""
    return x[:-1]

def suffix(x):
    """Drop the first character."""
    return x[1:]

def succ(x):
    """Successor dummy edge: shift right and pad with '$'. Assumes k > 0."""
    return "$" + x[0:-1]

def graph_key(x):
    """Sort key for the graph order: (reversed node, edge symbol)."""
    return (reverse(node(x)), edge(x))

# Take a set of dummy edges for a given level, and generate the set of all successors
# e.g. {$acg, $act} -> { $$ac }
# If we check for an equality of the set, it also shows us that all lower levels have predecessors
def dummy_successors(level_set):
    return {succ(x) for x in level_set}

def group_suffixes(kmers):
    """Group consecutive kmers by the suffix of their node, i.e. kmer[1:-1].

    NOTE: itertools.groupby only merges adjacent equal keys, so kmers must
    already be ordered with equal keys adjacent.
    """
    return it.groupby(kmers, key=lambda kmer: suffix(node(kmer)))

def edge_flagger(flag):
    """Return a generator function over a (suffix, kmers) group that yields
    `flag` for the first occurrence of each edge symbol and `not flag` for
    repeats within the group."""
    # BUG FIX: the inner def used a Python-2-only tuple parameter
    # `def flag_edge((suffix, kmers))`, a SyntaxError on Python 3, and its
    # `suffix` name shadowed the module-level helper.  Unpack explicitly.
    def flag_edge(group):
        _, kmers = group  # the group key itself is not needed
        # this set will grow to a maximum of the size of the alphabet
        seen = set()
        for kmer in kmers:
            e = edge(kmer)
            yield not flag if e in seen else flag
            seen.add(e)
    return flag_edge
def pairs(iterable):
    """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
    a, b = it.tee(iterable)
    next(b, None)
    # itertools.izip was removed in Python 3, where the builtin zip is
    # already lazy; keep izip on Python 2 so the pairing stays lazy there.
    lazy_zip = getattr(it, "izip", zip)
    return lazy_zip(a, b)

def is_sorted(xs, key=lambda x: x, comp=op.le):
    """True iff key(x) over xs is ordered under comp (default non-decreasing).

    Vacuously True for empty or single-element xs.
    """
    return all(comp(key(a), key(b)) for a, b in pairs(xs))
W = 64  # bits per packed word in the .packed file

def get_element_from_block(block, i, r=5):
    """Extract the i-th r-bit element of a W-bit block.

    Elements are packed most-significant first: with the default r=5 a
    64-bit word holds 12 elements and the low 4 bits are unused padding.
    Uses // so the shifts stay integral on Python 3 as well as Python 2.
    """
    shift = r * ((W // r) - i - 1)
    mask = (1 << r) - 1  # generalizes the hard-coded 31 (== 2**5 - 1)
    # int() because blocks come from numpy as uint64 scalars
    return (int(block) >> shift) & mask

def get_sym(x):
    """Edge symbol of a 5-bit element: the top 3 bits index "$ACGT"."""
    return "$ACGT"[x >> 2]

def get_node_flag(x):
    """Node flag: bit 1 of the element."""
    return (x & 2) >> 1

def get_edge_flag(x):
    """Edge flag: bit 0 of the element."""
    return x & 1

def get_element(blocks, i, r=5):
    """Fetch the i-th r-bit element from a sequence of W-bit blocks.

    // replaces the original /, which yields float indices on Python 3;
    r is now forwarded to get_element_from_block (the original silently
    used the default r=5 there, a latent bug for any other width).
    """
    per_block = W // r
    return get_element_from_block(blocks[i // per_block], i % per_block, r)
In [ ]:
# Load Data
# DSK parse output: whitespace-separated rows; the first field is the kmer,
# the second field (presumably the count — confirm against dsk's output) is discarded.
with open(dsk_filename) as f:
    dsk_kmers = set([x.upper() for x,_ in (line.split() for line in f.readlines())])
# cosmo-pack verbose (.pre) output: "<kmer> <node-flag> <edge-flag>" rows, in graph order.
with open(pre_filename) as f:
    our_data = [(x, int(y), int(z)) for x,y,z in (line.split() for line in f.readlines())] # kmer, node flag, edge flag triples
# k is inferred from an arbitrary DSK kmer; uniform length is asserted below
k = len(one_of(dsk_kmers))
# Flag polarity is taken from the first row and used as the reference
# value in the flag checks below
first_node_flag = our_data[0][1]
first_edge_flag = our_data[0][2]
# Get important parts of our structure
ordered_kmers = [x.upper() for x,_,_ in our_data]
our_kmers = set(ordered_kmers)
#node_flags = [y for _,y,_ in our_data]
#edge_flags = [z for _,_,z in our_data]
# For each non-dummy (k-1)-length node: how many edges start at it (prefix)
# and how many end at it (suffix)
start_nodes = Counter(kmer[ :-1] for kmer in our_kmers if "$" not in kmer[ :-1])
end_nodes = Counter(kmer[1: ] for kmer in our_kmers if "$" not in kmer[1: ])
# Outgoing dummy edges end with "$"; incoming dummies start with "$"
# (nested dummies like "$$.." are deliberately excluded from in_dummies)
out_dummies = [kmer[ :-1] for kmer in our_kmers if kmer[-1] == "$"]
in_dummies = [kmer[1: ] for kmer in our_kmers if kmer[0] == "$" and "$" not in kmer[1:]]
# Extract dummy tree
# Level = number of "$" symbols in the edge; sort so groupby sees each level contiguously
in_dummy_tree_edges = sorted((x for x in our_kmers if x[0] == "$"), key=lambda x: x.count("$"))
# Can use set since we will have already checked for duplicates
in_dummy_tree = {d:set(e) for d,e in it.groupby(in_dummy_tree_edges, lambda x: x.count("$"))}
In [ ]:
# Tests — basic sanity checks on the loaded data
assert k > 0
# Check for same length (our kmers match the length inferred from DSK)
assert len(one_of(our_kmers)) == k
# No Duplicates (the .pre file contained no repeated kmer rows)
assert len(our_kmers) == len(our_data)
In [ ]:
# Check that our kmers are all in DSK.
# Dummy edges (containing "$") have no DSK counterpart; for real kmers,
# accept either orientation since DSK may store the reverse complement.
for kmer in our_kmers:
    assert "$" in kmer or kmer in dsk_kmers or twin(kmer) in dsk_kmers
In [ ]:
# Check that DSK completely represented
# And that we have all twins (our set must hold each DSK kmer AND its
# reverse complement)
for kmer in dsk_kmers:
    assert kmer in our_kmers and twin(kmer) in our_kmers
In [ ]:
# TODO
# Structural checks on node degrees and dummy edges.
# Every start node is also an end node
for v in start_nodes:
    assert v in end_nodes, "%s not in end_nodes" % (v)
# Every end node is also a start node
for v in end_nodes:
    assert v in start_nodes, "%s not in start_nodes" % (v)
# All out-dummies have only one outgoing edge for their node
# (start_nodes counts edges leaving each node)
for v in out_dummies:
    assert start_nodes[v] == 1
# We didn't add any un-necessary in-dummies (that already had a predecessor)
# (end_nodes counts edges entering each node)
for v in in_dummies:
    assert end_nodes[v] == 1
# In-dummies are never out-dummies
for v in in_dummies:
    assert v[-1] != "$"
# Out-dummies are never in-dummies
for v in out_dummies:
    assert v[0] != "$"
In [ ]:
# Check that all in-dummies form a connected tree:
# the successor set of each depth-d level must be exactly the depth-(d+1)
# level, which also shows every lower level has predecessors.
for depth in xrange(1,k-1):
    assert dummy_successors(in_dummy_tree[depth]) == in_dummy_tree[depth+1]
In [ ]:
# Check first node flag: a row must carry first_node_flag exactly when its
# node differs from the previous row's node (i.e. it starts a new node run).
prev_node = None
for num, (row, first, _) in enumerate(our_data):
    # BUG FIX: the original message had five %s placeholders but only three
    # arguments, so a failing assert raised TypeError instead of reporting.
    assert (node(row) != prev_node) == (first == first_node_flag), "%s %s %s" % (num, row, first)
    prev_node = node(row)
In [ ]:
# Check edge flag: recompute the expected flags by grouping kmers on their
# node suffix and flagging the first occurrence of each edge symbol per group,
# then compare against the flags read from the .pre file.
expected_edge_flags = flatten(it.imap(edge_flagger(first_edge_flag), group_suffixes(ordered_kmers)))
for num, (expected_flag, (row, _, flag)) in enumerate(it.izip(expected_edge_flags, our_data)):
    assert (flag == expected_flag), "%s: %s, expected %s, got %s" %(num, row, expected_flag, flag)
In [ ]:
# Check sorted order (map key and assert is_sorted)
# Rows must be sorted by (reversed node, edge symbol)
assert is_sorted(ordered_kmers, key=graph_key)
In [ ]:
# Load packed rep — a flat array of uint64 words.
# Layout (from the slicing below): the last word is k, the preceding five
# words are per-symbol counts (the last of which equals the total number of
# elements, asserted further down), and everything before that is the
# bit-packed elements themselves.
with open(packed_filename) as f:
    packed_data = np.fromfile(f, dtype=np.uint64)
packed_k = packed_data[-1]
counts = packed_data[-6:-1]
packed_len = counts[-1]
blocks = packed_data[:-6]
# Check k values
assert packed_k == k, "%s != %s" % (packed_k, k)
# isolate f table and w vector
# f_col: last symbol of each node (kmer[-2]); w_col: edge symbol (kmer[-1])
f_col = [x[-2] for x in ordered_kmers]
w_col = [x[-1] for x in ordered_kmers]
In [ ]:
# Check symbol counts (same as f?)
# f_col must be non-decreasing, since rows are sorted by reversed node
assert is_sorted(f_col)
from collections import Counter  # NOTE(review): redundant — already imported at the top of the file
expected_counts = sorted(Counter(f_col).items())
# all 5 distinct symbols must occur in the F column
assert len(expected_counts) == 5
assert packed_len == len(ordered_kmers), "%s != %s" % (packed_len, len(ordered_kmers))
# counts[] holds cumulative (running-total) symbol counts — compare each prefix sum
total = 0
for idx, (_, count) in enumerate(expected_counts):
    total += count
    assert total == counts[idx]
# TODO: add check for 1:1 relationship between f and w (first and minus flags)
# Check packed W and flags
# NOTE(review): all five of these are single-pass generators — the asserts
# below consume them, so they are empty if reused afterwards.
expected_node_flags = (y for _,y,_ in our_data)
expected_edge_flags = (z for _,_,z in our_data)
packed_node_flags = (get_node_flag(get_element(blocks, i)) for i in xrange(packed_len))
packed_edge_flags = (get_edge_flag(get_element(blocks, i)) for i in xrange(packed_len))
packed_symbols = (get_sym(get_element(blocks, i)) for i in xrange(packed_len))
assert all(a == b for a,b in it.izip(packed_node_flags, expected_node_flags))
assert all(a == b for a,b in it.izip(packed_edge_flags, expected_edge_flags))
assert all(a == b for a,b in it.izip(w_col, packed_symbols))
In [ ]:
# cosmo-assemble verbose output: "<symbol> <flag> <flag>" rows; the flags are
# inverted (1 - ...) before comparing, so the dbg file evidently stores the
# complement of the .pre flags — confirm against cosmo-assemble's format.
with open(dbg_filename) as f:
    dbg = [(x.rstrip(), 1 - int(y), 1 - int(z)) for x,y,z in (row.split() for row in f.readlines())]
# Check dbg (W and flags are the same)
# izip stops at the shorter sequence, so check the lengths match first
assert len(dbg) == len(our_data), "%s != %s" % (len(dbg), len(our_data))
dbg_symbols = (x for x,_,_ in dbg)
dbg_node_flags = (y for _,y,_ in dbg)
dbg_edge_flags = (z for _,_,z in dbg)
# BUG FIX: expected_node_flags / expected_edge_flags were generators that the
# packed-file checks above had already exhausted, so the two flag asserts
# below compared zero elements and passed vacuously.  Rebuild them here.
expected_node_flags = (y for _,y,_ in our_data)
expected_edge_flags = (z for _,_,z in our_data)
assert all(a == b for a,b in it.izip(w_col, dbg_symbols))
assert all(a == b for a,b in it.izip(expected_node_flags, dbg_node_flags))
assert all(a == b for a,b in it.izip(expected_edge_flags, dbg_edge_flags))
# TODO: Check counts, k, etc...
# TODO: Check results of functions (edge label...)
# TODO: Check contigs (every dsk kmer XOR its twin appears once and once only)
In [ ]:
In [ ]: