In [1]:
from goetia import libgoetia
#from goetia.hashing import UKHS
#from goetia.hashing import RollingHashShifter
from debruijnal_enhance_o_tron.sequence import kmers
from debruijnal_enhance_o_tron.sequence import get_random_sequence
In [2]:
import bbhash
In [3]:
from goetia.data import load_unikmer_map, parse_unikmers
In [4]:
from cppyy import gbl
import cppyy
In [5]:
hash_cyclic = libgoetia.hashing.hash_cyclic
In [6]:
unikmers = parse_unikmers(27, 7)
In [7]:
seq = 'ACAGTACAGTCAAGCGATACTCCGAGCTTAACCGGACAACGGTTCTCGCCTATCGCCCTTCACCTCAATTGGG'
print(seq[:27])
print(seq[27:])
In [8]:
unikmer_map = load_unikmer_map(27, 7)
shifter = libgoetia.hashing.UKHS.LazyShifter(27, 7, unikmer_map)
In [9]:
g = libgoetia.PdBG[libgoetia.storage.SparseppSetStorage](27, 7, unikmer_map.__smartptr__())
In [10]:
hashes = g.get_hashes(seq)
In [11]:
def get_min_unikmer(wmer, uk_map):
wmer_hash = libgoetia.hashing.hash_cyclic(wmer, len(wmer))
kmer_hashes = [libgoetia.hashing.hash_cyclic(kmer, uk_map.K) for kmer in kmers(wmer, uk_map.K)]
unikmers = []
for h in kmer_hashes:
unikmer = libgoetia.hashing.UKHS.Unikmer(h)
if uk_map.query(unikmer):
unikmers.append(unikmer)
#print(kmer_hashes)
#print([str(u) for u in unikmers])
return wmer_hash, min(unikmers, key=lambda elem: elem.hash)
In [12]:
get_min_unikmer(seq[35:35+27], unikmer_map)
Out[12]:
In [14]:
for i, (kmer, second) in enumerate(zip(kmers(seq, 27), hashes)):
first = g.hash(kmer)
print(i)
print(first.hash, second.hash)
print(first.unikmer, second.unikmer)
assert first.unikmer.hash == second.unikmer.hash
print(get_min_unikmer(kmer, unikmer_map))
print('---')
In [9]:
uhashes = list(unikmer_map.get_hashes())
In [15]:
for kmer in kmers(seq, 27):
print(hash_cyclic(kmer, 27))
In [21]:
uks = []
for i, kmer in enumerate(kmers(seq[:31], 7)):
h = libgoetia.hashing.hash_cyclic(kmer, 7)
if i == 20:
print('---')
print(min(uks))
print(h)
print(h in uhashes)
if h in uhashes:
uks.append(h)
print(min(uks))
In [30]:
libgoetia.hashing.hash_cyclic(seq[:27], 27)
Out[30]:
In [18]:
it = libgoetia.hashing.KmerIterator[libgoetia.hashing.UKHS.LazyShifter](seq, shifter)
i = 0
while not it.done():
h = it.next()
print(i, h.hash, h.unikmer.hash, h.unikmer.partition)
i += 1
In [4]:
ukhs.find_unikmers(seq)
Out[4]:
In [6]:
ukhs.set_cursor(seq)
In [7]:
seq
Out[7]:
In [2]:
class dBG:
'''A basic de Bruijn Graph.
'''
def __init__(self, ksize, Sigma='ACGT', *args, **kwargs):
self.store = set()
self.tags = set()
self.ksize = ksize
self.Sigma = Sigma
self.Sigma_set = set(self.Sigma)
self.Sigma_list = list(self.Sigma)
def kmers(self, sequence):
'''Yields the k-mers in the given sequence.
'''
for i in range(len(sequence) - self.ksize + 1):
yield sequence[i:i+self.ksize]
def reset(self):
'''Reset the dBG to an empty state.
'''
self.store.clear()
def get(self, item):
'''Return true if the item is in the dBG, false otherwise
'''
return item in self.store
def add(self, item):
'''Add the k-mer(s) from the given sequence to the dBG.
'''
if len(item) < self.ksize:
raise ValueError(item)
elif len(item) == self.ksize:
self.store.add(item)
else:
self.store.update(self.kmers(item))
def tag(self, item, dist=3):
if len(item) < self.ksize:
raise ValueError(item)
count = 0
n_tagged = 0
for kmer in self.kmers(item):
if kmer in self.tags:
count = 0
else:
count += 1
if count > dist:
self.tags.add(kmer)
count = 0
n_tagged += 1
return n_tagged
def remove(self, item):
'''Remove the k-mer(s) from the given sequence from the dBG.
'''
if len(item) < self.ksize:
raise ValueError(item)
elif len(item) == self.ksize:
self.store.remove(item)
else:
self.store.difference_update(self.kmers(item))
def suffix(self, sequence):
'''Returns the length k-1 suffix of the sequence.
'''
if len(sequence) < self.ksize:
raise ValueError('Sequence too short')
return sequence[-self.ksize+1:]
def prefix(self, sequence):
'''Returns the length k-1 prefix of the sequence.
'''
if len(sequence) < self.ksize:
raise ValueError('Sequence too short')
return sequence[:self.ksize-1]
def left_neighbors(self, item):
'''Yields the four left neighbors of the given k-mer.
'''
prefix = self.prefix(item)
for b in self.Sigma:
yield b + prefix
def right_neighbors(self, item):
'''Yields the four right neighbors of the given k-mer.
'''
suffix = self.suffix(item)
for b in self.Sigma:
yield suffix + b
def left_degree(self, item):
'''Yields the in-degree (left degree) of the given k-mer.
'''
return sum((self.get(neighbor) for neighbor in self.left_neighbors(item)))
def right_degree(self, item):
'''Yields the out-degree (right degree) of the given k-mer.
'''
return sum((self.get(neighbor) for neighbor in self.right_neighbors(item)))
def degree(self, item):
return self.left_degree(item), self.right_degree(item)
def is_decision(self, kmer):
'''Returns True if the given k-mer is a decision k-mer in the dBG:
that is, if its in-degree or out-degree is greater than one.
'''
return self.left_degree(kmer) > 1 or self.right_degree(kmer) > 1
def find_decisions(self, sequence):
'''Yields the position and k-mer sequence of the decision k-mers in the
given sequence.
'''
for n, kmer in enumerate(self.kmers(sequence)):
if self.is_decision(kmer):
yield n, kmer
In [15]:
for kmer in kmers(seq, 7):
print('(', hasher.hash(kmer), ', ', ukhs.query(hasher.hash(kmer)), ')', sep='')
In [8]:
ukhs.find_unikmers(seq)
Out[8]:
In [6]:
c_ukhs = ukhs.hashes
In [7]:
mph = bbhash.PyMPHF(c_ukhs, len(c_ukhs), 1, 2.0)
In [12]:
for kmer in kmers(seq, 7):
h = hasher.hash(kmer)
print(h, kmer, ukhs.query(h), h in c_ukhs, mph.lookup(h))
In [13]:
ukhs.set_cursor(seq)
In [14]:
ukhs.unikmer
Out[14]:
In [15]:
ukhs.partition
Out[15]:
In [16]:
seq
Out[16]:
In [17]:
ukhs.hashvalue
Out[17]:
In [45]:
hpp = ''
cpp = ''
for W in range(20, 210, 10):
for K in range(7, 10):
cpp += ' const std::vector<std::string> UKHSData::W{0}_K{1} = {{'.format(W, K)
hpp += ' static const std::vector<std::string> W{0}_K{1};\n'.format(W, K)
with open('../goetia/ukhs-data/res_{0}_{1}_4_0.txt'.format(K, W)) as fp:
for n, line in enumerate(fp):
kmer = line.strip()
if n != 0:
cpp += ', '
cpp += '"{0}"'.format(kmer)
cpp += '};\n'
In [40]:
print(hpp)
In [36]:
func = 'const std::vector<std::string>& get_ukhs(int W, int K) {\n if '
for W in range(20, 210, 10):
for K in range(7, 10):
func += '(W == {W} && K == {K}) {{\n return W{W}_K{K};\n }} else if '.format(W=W, K=K)
func += '\n}'
In [37]:
print(func)
In [46]:
with open('defs.cc', 'w') as fp:
print(cpp, file=fp)
In [ ]: