In [1]:
from goetia import libgoetia
#from goetia.hashing import UKHS
#from goetia.hashing import RollingHashShifter
from debruijnal_enhance_o_tron.sequence import kmers
from debruijnal_enhance_o_tron.sequence import get_random_sequence


BEGIN DECLS

In [2]:
import bbhash

In [3]:
from goetia.data import load_unikmer_map, parse_unikmers

In [4]:
from cppyy import gbl
import cppyy

In [5]:
hash_cyclic = libgoetia.hashing.hash_cyclic

In [6]:
unikmers = parse_unikmers(27, 7)

In [7]:
seq = 'ACAGTACAGTCAAGCGATACTCCGAGCTTAACCGGACAACGGTTCTCGCCTATCGCCCTTCACCTCAATTGGG'
print(seq[:27])
print(seq[27:])


ACAGTACAGTCAAGCGATACTCCGAGC
TTAACCGGACAACGGTTCTCGCCTATCGCCCTTCACCTCAATTGGG

In [8]:
unikmer_map = load_unikmer_map(27, 7)
shifter = libgoetia.hashing.UKHS.LazyShifter(27, 7, unikmer_map)

In [9]:
g = libgoetia.PdBG[libgoetia.storage.SparseppSetStorage](27, 7, unikmer_map.__smartptr__())

In [10]:
hashes = g.get_hashes(seq)

In [11]:
def get_min_unikmer(wmer, uk_map):
    wmer_hash = libgoetia.hashing.hash_cyclic(wmer, len(wmer))
    kmer_hashes = [libgoetia.hashing.hash_cyclic(kmer, uk_map.K) for kmer in kmers(wmer, uk_map.K)]
    unikmers = []
    for h in kmer_hashes:
        unikmer = libgoetia.hashing.UKHS.Unikmer(h)
        if uk_map.query(unikmer):
            unikmers.append(unikmer)
    #print(kmer_hashes)
    #print([str(u) for u in unikmers])
    return wmer_hash, min(unikmers, key=lambda elem: elem.hash)

In [12]:
get_min_unikmer(seq[35:35+27], unikmer_map)


Out[12]:
(15871885999159907979, <Unikmer h=742322610132807068 p=483>)

In [14]:
for i, (kmer, second) in enumerate(zip(kmers(seq, 27), hashes)):
    first = g.hash(kmer)
    print(i)
    print(first.hash, second.hash)
    print(first.unikmer, second.unikmer)
    assert first.unikmer.hash == second.unikmer.hash
    print(get_min_unikmer(kmer, unikmer_map))
    print('---')


0
14637754006719005196 14637754006719005196
<Unikmer h=26644528043774785 p=3264> <Unikmer h=26644528043774785 p=3264>
(14637754006719005196, <Unikmer h=26644528043774785 p=3264>)
---
1
11599635404035127656 11599635404035127656
<Unikmer h=26644528043774785 p=3264> <Unikmer h=26644528043774785 p=3264>
(11599635404035127656, <Unikmer h=26644528043774785 p=3264>)
---
2
5304066999588069142 5304066999588069142
<Unikmer h=3374902433127799277 p=1075> <Unikmer h=3374902433127799277 p=1075>
(5304066999588069142, <Unikmer h=3374902433127799277 p=1075>)
---
3
5283529904221843974 5283529904221843974
<Unikmer h=3374902433127799277 p=1075> <Unikmer h=3374902433127799277 p=1075>
(5283529904221843974, <Unikmer h=3374902433127799277 p=1075>)
---
4
3646563514192044101 3646563514192044101
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(3646563514192044101, <Unikmer h=923109349927235793 p=1331>)
---
5
10003247902999664774 10003247902999664774
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(10003247902999664774, <Unikmer h=923109349927235793 p=1331>)
---
6
17722290116046448691 17722290116046448691
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(17722290116046448691, <Unikmer h=923109349927235793 p=1331>)
---
7
7679432260906167420 7679432260906167420
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(7679432260906167420, <Unikmer h=923109349927235793 p=1331>)
---
8
7674659716255888469 7674659716255888469
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(7674659716255888469, <Unikmer h=923109349927235793 p=1331>)
---
9
8447963904819834083 8447963904819834083
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(8447963904819834083, <Unikmer h=923109349927235793 p=1331>)
---
10
404950410254762442 404950410254762442
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(404950410254762442, <Unikmer h=923109349927235793 p=1331>)
---
11
17260881163556085512 17260881163556085512
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(17260881163556085512, <Unikmer h=923109349927235793 p=1331>)
---
12
391980499709686331 391980499709686331
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(391980499709686331, <Unikmer h=923109349927235793 p=1331>)
---
13
16912795032406539592 16912795032406539592
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(16912795032406539592, <Unikmer h=923109349927235793 p=1331>)
---
14
1194333815776190047 1194333815776190047
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(1194333815776190047, <Unikmer h=923109349927235793 p=1331>)
---
15
11551983283636093093 11551983283636093093
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(11551983283636093093, <Unikmer h=923109349927235793 p=1331>)
---
16
884798678176461913 884798678176461913
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(884798678176461913, <Unikmer h=923109349927235793 p=1331>)
---
17
3329161728876287427 3329161728876287427
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(3329161728876287427, <Unikmer h=923109349927235793 p=1331>)
---
18
12935417283092362122 12935417283092362122
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(12935417283092362122, <Unikmer h=923109349927235793 p=1331>)
---
19
5889295890019752548 5889295890019752548
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(5889295890019752548, <Unikmer h=923109349927235793 p=1331>)
---
20
9074840525637778752 9074840525637778752
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(9074840525637778752, <Unikmer h=923109349927235793 p=1331>)
---
21
5473928510598534943 5473928510598534943
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(5473928510598534943, <Unikmer h=923109349927235793 p=1331>)
---
22
5290357668946765750 5290357668946765750
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(5290357668946765750, <Unikmer h=923109349927235793 p=1331>)
---
23
5500013604579673828 5500013604579673828
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(5500013604579673828, <Unikmer h=923109349927235793 p=1331>)
---
24
15294605780244822234 15294605780244822234
<Unikmer h=923109349927235793 p=1331> <Unikmer h=923109349927235793 p=1331>
(15294605780244822234, <Unikmer h=923109349927235793 p=1331>)
---
25
8279799774688394655 8279799774688394655
<Unikmer h=4760973827227662090 p=352> <Unikmer h=4760973827227662090 p=352>
(8279799774688394655, <Unikmer h=4760973827227662090 p=352>)
---
26
12189322892800003628 12189322892800003628
<Unikmer h=4760973827227662090 p=352> <Unikmer h=4760973827227662090 p=352>
(12189322892800003628, <Unikmer h=4760973827227662090 p=352>)
---
27
10148827247862175185 10148827247862175185
<Unikmer h=4760973827227662090 p=352> <Unikmer h=4760973827227662090 p=352>
(10148827247862175185, <Unikmer h=4760973827227662090 p=352>)
---
28
12229545695231837756 12229545695231837756
<Unikmer h=4760973827227662090 p=352> <Unikmer h=4760973827227662090 p=352>
(12229545695231837756, <Unikmer h=4760973827227662090 p=352>)
---
29
13588748295235310709 13588748295235310709
<Unikmer h=4760973827227662090 p=352> <Unikmer h=4760973827227662090 p=352>
(13588748295235310709, <Unikmer h=4760973827227662090 p=352>)
---
30
11060484655501799893 11060484655501799893
<Unikmer h=4760973827227662090 p=352> <Unikmer h=4760973827227662090 p=352>
(11060484655501799893, <Unikmer h=4760973827227662090 p=352>)
---
31
15179973512562227861 15179973512562227861
<Unikmer h=4760973827227662090 p=352> <Unikmer h=4760973827227662090 p=352>
(15179973512562227861, <Unikmer h=4760973827227662090 p=352>)
---
32
12482777229194172652 12482777229194172652
<Unikmer h=4760973827227662090 p=352> <Unikmer h=4760973827227662090 p=352>
(12482777229194172652, <Unikmer h=4760973827227662090 p=352>)
---
33
5915758169474489374 5915758169474489374
<Unikmer h=12274243358220278312 p=1203> <Unikmer h=12274243358220278312 p=1203>
(5915758169474489374, <Unikmer h=12274243358220278312 p=1203>)
---
34
4484852343624788321 4484852343624788321
<Unikmer h=12274243358220278312 p=1203> <Unikmer h=12274243358220278312 p=1203>
(4484852343624788321, <Unikmer h=12274243358220278312 p=1203>)
---
35
15871885999159907979 15871885999159907979
<Unikmer h=742322610132807068 p=483> <Unikmer h=742322610132807068 p=483>
(15871885999159907979, <Unikmer h=742322610132807068 p=483>)
---
36
6404093841576293417 6404093841576293417
<Unikmer h=742322610132807068 p=483> <Unikmer h=742322610132807068 p=483>
(6404093841576293417, <Unikmer h=742322610132807068 p=483>)
---
37
8014836579513508314 8014836579513508314
<Unikmer h=742322610132807068 p=483> <Unikmer h=742322610132807068 p=483>
(8014836579513508314, <Unikmer h=742322610132807068 p=483>)
---
38
16774087066814693061 16774087066814693061
<Unikmer h=742322610132807068 p=483> <Unikmer h=742322610132807068 p=483>
(16774087066814693061, <Unikmer h=742322610132807068 p=483>)
---
39
3586114045941294261 3586114045941294261
<Unikmer h=742322610132807068 p=483> <Unikmer h=742322610132807068 p=483>
(3586114045941294261, <Unikmer h=742322610132807068 p=483>)
---
40
9745631285740836342 9745631285740836342
<Unikmer h=742322610132807068 p=483> <Unikmer h=742322610132807068 p=483>
(9745631285740836342, <Unikmer h=742322610132807068 p=483>)
---
41
12556157603925651364 12556157603925651364
<Unikmer h=742322610132807068 p=483> <Unikmer h=742322610132807068 p=483>
(12556157603925651364, <Unikmer h=742322610132807068 p=483>)
---
42
1181760844555233883 1181760844555233883
<Unikmer h=742322610132807068 p=483> <Unikmer h=742322610132807068 p=483>
(1181760844555233883, <Unikmer h=742322610132807068 p=483>)
---
43
1856066206946519285 1856066206946519285
<Unikmer h=742322610132807068 p=483> <Unikmer h=742322610132807068 p=483>
(1856066206946519285, <Unikmer h=742322610132807068 p=483>)
---
44
9481193187549641845 9481193187549641845
<Unikmer h=742322610132807068 p=483> <Unikmer h=742322610132807068 p=483>
(9481193187549641845, <Unikmer h=742322610132807068 p=483>)
---
45
9678603839585486064 9678603839585486064
<Unikmer h=742322610132807068 p=483> <Unikmer h=742322610132807068 p=483>
(9678603839585486064, <Unikmer h=742322610132807068 p=483>)
---
46
13597862509905323134 13597862509905323134
<Unikmer h=742322610132807068 p=483> <Unikmer h=742322610132807068 p=483>
(13597862509905323134, <Unikmer h=742322610132807068 p=483>)
---

In [9]:
uhashes = list(unikmer_map.get_hashes())

In [15]:
for kmer in kmers(seq, 27):
    print(hash_cyclic(kmer, 27))


14637754006719005196
11599635404035127656
5304066999588069142
5283529904221843974
3646563514192044101
10003247902999664774
17722290116046448691
7679432260906167420
7674659716255888469
8447963904819834083
404950410254762442
17260881163556085512
391980499709686331
16912795032406539592
1194333815776190047
11551983283636093093
884798678176461913
3329161728876287427
12935417283092362122
5889295890019752548
9074840525637778752
5473928510598534943
5290357668946765750
5500013604579673828
15294605780244822234
8279799774688394655
12189322892800003628
10148827247862175185
12229545695231837756
13588748295235310709
11060484655501799893
15179973512562227861
12482777229194172652
5915758169474489374
4484852343624788321
15871885999159907979
6404093841576293417
8014836579513508314
16774087066814693061
3586114045941294261
9745631285740836342
12556157603925651364
1181760844555233883
1856066206946519285
9481193187549641845
9678603839585486064
13597862509905323134

In [21]:
uks = []
for i, kmer in enumerate(kmers(seq[:31], 7)):
    h = libgoetia.hashing.hash_cyclic(kmer, 7)
    if i == 20:
        print('---')
        print(min(uks))
    print(h)
    print(h in uhashes)
    if h in uhashes:
        uks.append(h)
print(min(uks))


2555500102223218697
False
26644528043774785
True
4234649233007660875
False
16132908444298833054
False
2576154049931496648
False
7863512023346214709
False
11309347485062507833
True
260243592574514106
False
8914998625987408179
False
3784318189051200000
False
4875516863775730341
False
3761251956699049311
True
3374902433127799277
True
2390650065958888861
False
628186532513249920
False
8423348798003030362
False
8488192121313838913
False
14477513130367315286
False
13507571367634154504
True
6168629487121468357
False
---
26644528043774785
14896931290667994064
False
6557614920362575967
False
388065157563181227
False
6160219572893194264
False
923109349927235793
True
26644528043774785

In [30]:
libgoetia.hashing.hash_cyclic(seq[:27], 27)


Out[30]:
14637754006719005196

In [18]:
it = libgoetia.hashing.KmerIterator[libgoetia.hashing.UKHS.LazyShifter](seq, shifter)
i = 0
while not it.done():
    h = it.next()
    print(i, h.hash, h.unikmer.hash, h.unikmer.partition)
    i += 1


0 8902978578492780380 26644528043774785 3264
1 188617283967906032 26644528043774785 3264
2 2900901382139513322 3374902433127799277 1075
3 18082607458454997007 3374902433127799277 1075
4 13239091568011809634 3374902433127799277 1075
5 954163418890348058 3374902433127799277 1075
6 17293489416741216892 3374902433127799277 1075
7 2623830298610141372 3374902433127799277 1075
8 7763135789723687807 3374902433127799277 1075
9 10747402100642592131 3374902433127799277 1075
10 3298138176754731871 923109349927235793 1331
11 8571626476299878068 923109349927235793 1331
12 1759918312005489440 923109349927235793 1331
13 11146004114706840987 923109349927235793 1331
14 1351651595587475661 923109349927235793 1331
15 3659244596159892612 923109349927235793 1331
16 2160739476999507809 923109349927235793 1331
17 17604431492878837150 923109349927235793 1331
18 17228159651490556773 923109349927235793 1331
19 3145766553153894531 923109349927235793 1331
20 2728398619066739871 923109349927235793 1331
21 3022700373863029217 923109349927235793 1331
22 12137629589093220231 923109349927235793 1331
23 12355376700550001482 923109349927235793 1331
24 3351304249650639100 923109349927235793 1331
25 9109427135317470207 4760973827227662090 352
26 9564059358962401687 4760973827227662090 352
27 8933319429867888310 4760973827227662090 352
28 14576091636067272736 4760973827227662090 352
29 12189836022296782093 4760973827227662090 352
30 8159013797277614108 4760973827227662090 352
31 3297264645688179556 4760973827227662090 352
32 11533841022557174413 4760973827227662090 352
33 7881009225317489937 12969120887479638174 1091
34 18240397436929284568 12969120887479638174 1091
35 9379260355043304920 12274243358220278312 1203
36 17426338910066206201 12274243358220278312 1203
37 14903452856864646137 12274243358220278312 1203
38 13747303346285202932 12274243358220278312 1203
39 6845236417472729582 12274243358220278312 1203
40 10412447934835411926 12274243358220278312 1203
41 7160424944002143440 742322610132807068 483
42 13750020894288887753 742322610132807068 483
43 4618552683651648223 742322610132807068 483
44 7737564170864886973 742322610132807068 483
45 18139537796907426160 742322610132807068 483
46 17361551052051276473 742322610132807068 483
47 17774877717891627117 742322610132807068 483
48 4889827894350103293 742322610132807068 483
49 8947604768535988671 742322610132807068 483
50 10004098216815873767 742322610132807068 483
51 8638991989206359312 742322610132807068 483
52 5022021078786974203 742322610132807068 483

In [4]:
ukhs.find_unikmers(seq)


b'ACAGTACAGTCAAGCGATACTCCGAGCTTAACCGGACAACGGTTCTCGCCTATCGCCCTTCACCTCAATTGGG'
Out[4]:
[(4287246436847025391, 850),
 (4287246436847025391, 850),
 (4287246436847025391, 850),
 (4287246436847025391, 850),
 (6756425901368628185, 990),
 (6756425901368628185, 990),
 (6756425901368628185, 990),
 (6756425901368628185, 990),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (26644528043774785, 3264),
 (3374902433127799277, 1075),
 (3374902433127799277, 1075),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (923109349927235793, 1331),
 (4760973827227662090, 352),
 (4760973827227662090, 352),
 (4760973827227662090, 352),
 (4760973827227662090, 352),
 (4760973827227662090, 352),
 (4760973827227662090, 352),
 (4760973827227662090, 352),
 (4760973827227662090, 352),
 (12274243358220278312, 1203),
 (12274243358220278312, 1203),
 (742322610132807068, 483),
 (742322610132807068, 483),
 (742322610132807068, 483),
 (742322610132807068, 483),
 (742322610132807068, 483),
 (742322610132807068, 483),
 (742322610132807068, 483),
 (742322610132807068, 483),
 (742322610132807068, 483),
 (742322610132807068, 483),
 (742322610132807068, 483),
 (742322610132807068, 483)]

In [6]:
ukhs.set_cursor(seq)

In [7]:
seq


Out[7]:
'CGTGCAGTTCTAAATTCGAAGCCTGCAGGCTTGGTCGTGGGTTAAAGAATTTACTGTACACCAGCACATACGGCACCAGCTAATAAAAATACTTAAACAA'

In [2]:
class dBG:
    '''A basic de Bruijn Graph.
    '''

    def __init__(self, ksize, Sigma='ACGT', *args, **kwargs):
        self.store = set()
        self.tags = set()
        self.ksize = ksize
        self.Sigma = Sigma
        self.Sigma_set = set(self.Sigma)
        self.Sigma_list = list(self.Sigma)
        
    def kmers(self, sequence):
        '''Yields the k-mers in the given sequence.
        '''
        for i in range(len(sequence) - self.ksize + 1):
            yield sequence[i:i+self.ksize]

    def reset(self):
        '''Reset the dBG to an empty state.
        '''
        self.store.clear()

    def get(self, item):
        '''Return true if the item is in the dBG, false otherwise
        '''
        return item in self.store

    def add(self, item):
        '''Add the k-mer(s) from the given sequence to the dBG.
        '''
        if len(item) < self.ksize:
            raise ValueError(item)
        elif len(item) == self.ksize:
            self.store.add(item)
        else:
            self.store.update(self.kmers(item))
    
    def tag(self, item, dist=3):
        if len(item) < self.ksize:
            raise ValueError(item)
        count = 0
        n_tagged = 0
        for kmer in self.kmers(item):
            if kmer in self.tags:
                count = 0
            else:
                count += 1
                if count > dist:
                    self.tags.add(kmer)
                    count = 0
                    n_tagged += 1
        return n_tagged
    
    def remove(self, item):
        '''Remove the k-mer(s) from the given sequence from the dBG.
        '''
        if len(item) < self.ksize:
            raise ValueError(item)
        elif len(item) == self.ksize:
            self.store.remove(item)
        else:
            self.store.difference_update(self.kmers(item))
    
    def suffix(self, sequence):
        '''Returns the length k-1 suffix of the sequence.
        '''
        if len(sequence) < self.ksize:
            raise ValueError('Sequence too short')
        return sequence[-self.ksize+1:]
    
    def prefix(self, sequence):
        '''Returns the length k-1 prefix of the sequence.
        '''
        if len(sequence) < self.ksize:
            raise ValueError('Sequence too short')
        return sequence[:self.ksize-1]
    
    def left_neighbors(self, item):
        '''Yields the four left neighbors of the given k-mer.
        '''
        prefix = self.prefix(item)
        for b in self.Sigma:
            yield b + prefix
    
    def right_neighbors(self, item):
        '''Yields the four right neighbors of the given k-mer.
        '''
        suffix = self.suffix(item)
        for b in self.Sigma:
            yield suffix + b

    def left_degree(self, item):
        '''Yields the in-degree (left degree) of the given k-mer.
        '''
        return sum((self.get(neighbor) for neighbor in self.left_neighbors(item)))

    def right_degree(self, item):
        '''Yields the out-degree (right degree) of the given k-mer.
        '''
        return sum((self.get(neighbor) for neighbor in self.right_neighbors(item)))
    
    def degree(self, item):
        return self.left_degree(item), self.right_degree(item)
    
    def is_decision(self, kmer):
        '''Returns True if the given k-mer is a decision k-mer in the dBG:
        that is, if its in-degree or out-degree is greater than one.
        '''
        return self.left_degree(kmer) > 1 or self.right_degree(kmer) > 1

    def find_decisions(self, sequence):
        '''Yields the position and k-mer sequence of the decision k-mers in the 
        given sequence.
        '''
        for n, kmer in enumerate(self.kmers(sequence)):
            if self.is_decision(kmer):
                yield n, kmer

In [15]:
for kmer in kmers(seq, 7):
    print('(', hasher.hash(kmer), ', ', ukhs.query(hasher.hash(kmer)), ')', sep='')


(10973492759661448830, None)
(9508044623959221480, None)
(5551789466241464427, None)
(9981562942641867623, None)
(6910235179741095797, None)
(16173567458420033444, 773)
(9681429984473271322, None)
(12286883827231201492, None)
(10628437211489627735, None)
(16179131663506539345, None)
(11855042890701003513, None)
(462264805413977041, None)
(5361487379720942833, 1435)
(15167255132354652849, None)
(9767070260505360055, None)
(2271294049989203678, None)
(3429763133724022285, None)
(17028648904892568591, None)
(1970231056644343929, None)
(5345211148680906421, None)
(15199438622384311865, 21)
(7361969857450458133, None)
(17738795196983906787, None)
(9931439664741875613, None)
(15411137244630950085, None)
(16236571949337440305, None)
(17933022849944390058, None)
(15190505698020061313, None)
(17253953829566599353, None)
(4819930053698499718, 957)
(13816626555898921157, 155)
(13332147360646422133, None)
(4425900195240702665, None)
(13340665749868037620, None)
(12804676011767737231, None)
(11898217557306738913, 1758)
(17968778197695719894, None)
(6092996541620212556, 2619)
(9492807686967412797, None)
(11623510983791579290, None)
(9325400189813064019, None)
(11880919204116628038, None)
(7161673712181832232, None)
(53626167473015726, None)
(12328161675974042964, None)
(18178709115218868385, None)
(13707899354641478160, None)
(16176675171767972308, 2428)
(7680137566208037281, None)
(17751190970372276374, None)
(3083145406662860499, None)
(9165795826749625091, None)
(17218547609949330871, 2071)
(11194366683976824892, None)
(5998502094475439139, None)
(13181929551547245559, None)
(14693639319463020302, None)
(15615660206686486591, 2379)
(14837852380046874168, None)
(14154084896572728639, 2000)
(17607814655399468088, 1934)
(13190811586428727615, None)
(15960296688672119402, None)
(18181023002981922694, None)
(7248706085502253816, None)
(17530273200652098617, 2175)
(14259060301907516327, None)
(17171803341824355093, None)
(11109890741443728760, 1558)
(2652322357543686464, None)
(3827133968722949319, None)
(3855447806880653504, None)
(12175169244841402854, None)
(14837852380046874168, None)
(2933338114393199716, None)
(1718647070856965019, None)
(8073715770687889016, 1127)
(6014333372235564261, None)
(16167124664390961817, None)
(7390162086172464594, None)
(10567243520753328874, None)
(1147474059546209136, 2357)
(6442972878992250291, None)
(1783275578754881902, None)
(2008197580886072441, 655)
(5421294142873857717, None)
(4329076677671104354, None)
(15153649911872433356, None)
(16296954786638011082, None)
(9432673442155034310, 2268)
(3418037946617496360, None)
(2476921093017980951, None)
(1947723552378939744, None)
(2066882772635040869, None)

In [8]:
ukhs.find_unikmers(seq)


b'GGCTTGGTCGTGGGTTAAAGAATTTACTGTACACCAGCACATACGGCACCAGCTAATAAAAATACTTAAACAA'
Out[8]:
[(5361487379720942833, 1435),
 (5361487379720942833, 1435),
 (5361487379720942833, 1435),
 (5361487379720942833, 1435),
 (5361487379720942833, 1435),
 (5361487379720942833, 1435),
 (5361487379720942833, 1435),
 (5361487379720942833, 1435),
 (5361487379720942833, 1435),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (4819930053698499718, 957),
 (6092996541620212556, 2619),
 (6092996541620212556, 2619),
 (6092996541620212556, 2619),
 (6092996541620212556, 2619),
 (6092996541620212556, 2619),
 (6092996541620212556, 2619),
 (6092996541620212556, 2619),
 (6092996541620212556, 2619),
 (15615660206686486591, 2379),
 (14154084896572728639, 2000),
 (14154084896572728639, 2000),
 (14154084896572728639, 2000),
 (14154084896572728639, 2000),
 (14154084896572728639, 2000),
 (14154084896572728639, 2000),
 (14154084896572728639, 2000),
 (14154084896572728639, 2000),
 (14154084896572728639, 2000),
 (11109890741443728760, 1558),
 (11109890741443728760, 1558),
 (11109890741443728760, 1558),
 (11109890741443728760, 1558),
 (11109890741443728760, 1558),
 (11109890741443728760, 1558),
 (11109890741443728760, 1558),
 (11109890741443728760, 1558),
 (8073715770687889016, 1127),
 (8073715770687889016, 1127),
 (8073715770687889016, 1127),
 (8073715770687889016, 1127),
 (8073715770687889016, 1127),
 (1147474059546209136, 2357),
 (1147474059546209136, 2357),
 (1147474059546209136, 2357),
 (1147474059546209136, 2357),
 (1147474059546209136, 2357),
 (1147474059546209136, 2357),
 (1147474059546209136, 2357),
 (1147474059546209136, 2357),
 (1147474059546209136, 2357),
 (1147474059546209136, 2357),
 (1147474059546209136, 2357),
 (1147474059546209136, 2357),
 (1147474059546209136, 2357)]

In [6]:
c_ukhs = ukhs.hashes

In [7]:
mph = bbhash.PyMPHF(c_ukhs, len(c_ukhs), 1, 2.0)

In [12]:
for kmer in kmers(seq, 7):
    h = hasher.hash(kmer)
    print(h, kmer, ukhs.query(h), h in c_ukhs, mph.lookup(h))


3966585810826196083 TCACCTG None False 1195
12132712385095546648 CACCTGT None False 2647
7690237574936954872 ACCTGTG None False 163
9158360649715908088 CCTGTGT None False 2034
5571541805904823269 CTGTGTT 1681 True 1681
11591423859377237507 TGTGTTG None False 2715
9782452822128919545 GTGTTGT 1219 True 1219
14731550741190813589 TGTTGTG 1250 True 1250
9832345464020448410 GTTGTGC None False 2180
6488094177847892111 TTGTGCT 1980 True 1980
11435990624582328251 TGTGCTA None False 2830
3313502993816634566 GTGCTAC None False 2009
1232448785995184182 TGCTACT None False 462
16450901114743926674 GCTACTT None False None
823832800960675651 CTACTTG None False 410
8311279440186057436 TACTTGC None False 2884
12235821944465034138 ACTTGCG 1137 True 1137
8137706075476292833 CTTGCGG None False 43
9536326726933558680 TTGCGGC None False 1179
5174287711837398291 TGCGGCG None False None
11461645855627522455 GCGGCGC None False None

In [13]:
ukhs.set_cursor(seq)

In [14]:
ukhs.unikmer


Out[14]:
5571541805904823269

In [15]:
ukhs.partition


Out[15]:
1681

In [16]:
seq


Out[16]:
'TCACCTGTGTTGTGCTACTTGCGGCGC'

In [17]:
ukhs.hashvalue


Out[17]:
13194817695400542713

In [45]:
hpp = ''
cpp = ''
for W in range(20, 210, 10):
    for K in range(7, 10):
        cpp += '    const std::vector<std::string> UKHSData::W{0}_K{1} = {{'.format(W, K)
        hpp += '    static const std::vector<std::string> W{0}_K{1};\n'.format(W, K)
        with open('../goetia/ukhs-data/res_{0}_{1}_4_0.txt'.format(K, W)) as fp:          
            for n, line in enumerate(fp):
                kmer = line.strip()
                if n != 0:
                    cpp += ', '
                cpp += '"{0}"'.format(kmer)
        cpp += '};\n'

In [40]:
print(hpp)


    static const std::vector<std::string> W20_K7;
    static const std::vector<std::string> W20_K8;
    static const std::vector<std::string> W20_K9;
    static const std::vector<std::string> W30_K7;
    static const std::vector<std::string> W30_K8;
    static const std::vector<std::string> W30_K9;
    static const std::vector<std::string> W40_K7;
    static const std::vector<std::string> W40_K8;
    static const std::vector<std::string> W40_K9;
    static const std::vector<std::string> W50_K7;
    static const std::vector<std::string> W50_K8;
    static const std::vector<std::string> W50_K9;
    static const std::vector<std::string> W60_K7;
    static const std::vector<std::string> W60_K8;
    static const std::vector<std::string> W60_K9;
    static const std::vector<std::string> W70_K7;
    static const std::vector<std::string> W70_K8;
    static const std::vector<std::string> W70_K9;
    static const std::vector<std::string> W80_K7;
    static const std::vector<std::string> W80_K8;
    static const std::vector<std::string> W80_K9;
    static const std::vector<std::string> W90_K7;
    static const std::vector<std::string> W90_K8;
    static const std::vector<std::string> W90_K9;
    static const std::vector<std::string> W100_K7;
    static const std::vector<std::string> W100_K8;
    static const std::vector<std::string> W100_K9;
    static const std::vector<std::string> W110_K7;
    static const std::vector<std::string> W110_K8;
    static const std::vector<std::string> W110_K9;
    static const std::vector<std::string> W120_K7;
    static const std::vector<std::string> W120_K8;
    static const std::vector<std::string> W120_K9;
    static const std::vector<std::string> W130_K7;
    static const std::vector<std::string> W130_K8;
    static const std::vector<std::string> W130_K9;
    static const std::vector<std::string> W140_K7;
    static const std::vector<std::string> W140_K8;
    static const std::vector<std::string> W140_K9;
    static const std::vector<std::string> W150_K7;
    static const std::vector<std::string> W150_K8;
    static const std::vector<std::string> W150_K9;
    static const std::vector<std::string> W160_K7;
    static const std::vector<std::string> W160_K8;
    static const std::vector<std::string> W160_K9;
    static const std::vector<std::string> W170_K7;
    static const std::vector<std::string> W170_K8;
    static const std::vector<std::string> W170_K9;
    static const std::vector<std::string> W180_K7;
    static const std::vector<std::string> W180_K8;
    static const std::vector<std::string> W180_K9;
    static const std::vector<std::string> W190_K7;
    static const std::vector<std::string> W190_K8;
    static const std::vector<std::string> W190_K9;
    static const std::vector<std::string> W200_K7;
    static const std::vector<std::string> W200_K8;
    static const std::vector<std::string> W200_K9;


In [36]:
func = 'const std::vector<std::string>& get_ukhs(int W, int K) {\n    if '
for W in range(20, 210, 10):
    for K in range(7, 10):
        func += '(W == {W} && K == {K}) {{\n        return W{W}_K{K};\n    }} else if '.format(W=W, K=K)
func += '\n}'

In [37]:
print(func)


const std::vector<std::string>& get_ukhs(int W, int K) {
    if (W == 20 && K == 7) {
        return W20_K7;
    } else if (W == 20 && K == 8) {
        return W20_K8;
    } else if (W == 20 && K == 9) {
        return W20_K9;
    } else if (W == 30 && K == 7) {
        return W30_K7;
    } else if (W == 30 && K == 8) {
        return W30_K8;
    } else if (W == 30 && K == 9) {
        return W30_K9;
    } else if (W == 40 && K == 7) {
        return W40_K7;
    } else if (W == 40 && K == 8) {
        return W40_K8;
    } else if (W == 40 && K == 9) {
        return W40_K9;
    } else if (W == 50 && K == 7) {
        return W50_K7;
    } else if (W == 50 && K == 8) {
        return W50_K8;
    } else if (W == 50 && K == 9) {
        return W50_K9;
    } else if (W == 60 && K == 7) {
        return W60_K7;
    } else if (W == 60 && K == 8) {
        return W60_K8;
    } else if (W == 60 && K == 9) {
        return W60_K9;
    } else if (W == 70 && K == 7) {
        return W70_K7;
    } else if (W == 70 && K == 8) {
        return W70_K8;
    } else if (W == 70 && K == 9) {
        return W70_K9;
    } else if (W == 80 && K == 7) {
        return W80_K7;
    } else if (W == 80 && K == 8) {
        return W80_K8;
    } else if (W == 80 && K == 9) {
        return W80_K9;
    } else if (W == 90 && K == 7) {
        return W90_K7;
    } else if (W == 90 && K == 8) {
        return W90_K8;
    } else if (W == 90 && K == 9) {
        return W90_K9;
    } else if (W == 100 && K == 7) {
        return W100_K7;
    } else if (W == 100 && K == 8) {
        return W100_K8;
    } else if (W == 100 && K == 9) {
        return W100_K9;
    } else if (W == 110 && K == 7) {
        return W110_K7;
    } else if (W == 110 && K == 8) {
        return W110_K8;
    } else if (W == 110 && K == 9) {
        return W110_K9;
    } else if (W == 120 && K == 7) {
        return W120_K7;
    } else if (W == 120 && K == 8) {
        return W120_K8;
    } else if (W == 120 && K == 9) {
        return W120_K9;
    } else if (W == 130 && K == 7) {
        return W130_K7;
    } else if (W == 130 && K == 8) {
        return W130_K8;
    } else if (W == 130 && K == 9) {
        return W130_K9;
    } else if (W == 140 && K == 7) {
        return W140_K7;
    } else if (W == 140 && K == 8) {
        return W140_K8;
    } else if (W == 140 && K == 9) {
        return W140_K9;
    } else if (W == 150 && K == 7) {
        return W150_K7;
    } else if (W == 150 && K == 8) {
        return W150_K8;
    } else if (W == 150 && K == 9) {
        return W150_K9;
    } else if (W == 160 && K == 7) {
        return W160_K7;
    } else if (W == 160 && K == 8) {
        return W160_K8;
    } else if (W == 160 && K == 9) {
        return W160_K9;
    } else if (W == 170 && K == 7) {
        return W170_K7;
    } else if (W == 170 && K == 8) {
        return W170_K8;
    } else if (W == 170 && K == 9) {
        return W170_K9;
    } else if (W == 180 && K == 7) {
        return W180_K7;
    } else if (W == 180 && K == 8) {
        return W180_K8;
    } else if (W == 180 && K == 9) {
        return W180_K9;
    } else if (W == 190 && K == 7) {
        return W190_K7;
    } else if (W == 190 && K == 8) {
        return W190_K8;
    } else if (W == 190 && K == 9) {
        return W190_K9;
    } else if (W == 200 && K == 7) {
        return W200_K7;
    } else if (W == 200 && K == 8) {
        return W200_K8;
    } else if (W == 200 && K == 9) {
        return W200_K9;
    } else if 
}

In [46]:
with open('defs.cc', 'w') as fp:
    print(cpp, file=fp)

In [ ]: