Aim

Given a large set of sequences (or graphs with ordered vertices), find small vertex-ordered subsequences that are most discriminative for the set.

Steps:

  • devise a negative set
  • learn a discriminative model
  • annotate importance on vertices
  • extract max subarrays
  • cluster them
    • use fast EDeN string kernel
    • clustering algorithm

Output:

  1. all sequence motives in each cluster
  2. all initial sequences with motif location (begin,end) and cluster id (build regex from all seqs in cluster and run a find iterator)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from eden.util import configure_logging
import logging
# Hand the root logger (logging.getLogger() with no name) to EDeN's logging setup;
# verbosity=2 is passed through unchanged -- presumably a more verbose level, see eden.util to confirm.
configure_logging(logging.getLogger(),verbosity=2)

In [3]:
#code for making artificial dataset
import random
def random_string(length,alphabet_list):
    """Return a string of `length` symbols drawn uniformly from `alphabet_list`.

    One random.choice call is made per output symbol; a non-positive `length`
    yields the empty string.
    """
    symbols = []
    for _ in range(length):
        symbols.append(random.choice(alphabet_list))
    return ''.join(symbols)

def perturb(seed,alphabet_list,p=0.5):
    """Return a noisy copy of the string `seed`.

    Each character is independently replaced, with probability `p`, by a
    symbol drawn uniformly from `alphabet_list` (which may happen to be the
    same symbol). Characters that are not replaced are copied unchanged.
    """
    mutated = []
    for symbol in seed:
        # one uniform draw per position decides whether it is resampled
        replace = random.random() < p
        mutated.append(random.choice(alphabet_list) if replace else symbol)
    return ''.join(mutated)

def make_artificial_dataset(alphabet='ACGU', motives=None, motif_length=6, sequence_length=100, n_sequences=1000, n_motives=2, p=0.2):
    """Build a toy dataset of random sequences with a noisy motif planted in the middle.

    Every sequence is: random left flank + perturbed motif + random right flank.
    Sequences are generated round-robin over the motives.

    Args:
        alphabet: string whose characters are the allowed symbols.
        motives: optional list of motif strings. When None, `n_motives` random
            motives of length `motif_length` are generated. When given,
            `motif_length` and `n_motives` are overridden by len(motives[0])
            and len(motives).
        motif_length: length of the randomly generated motives.
        sequence_length: length of each generated sequence (exact for motives
            of length `motif_length`).
        n_sequences: requested number of sequences; n_sequences // n_motives
            sequences are generated per motif, so a remainder is dropped.
        n_motives: number of randomly generated motives.
        p: per-character replacement probability handed to perturb().

    Returns:
        (motives, seqs): the list of clean motives and a list of
        (header, sequence) tuples with headers '>ID0', '>ID1', ...
    """
    alphabet_list = list(alphabet)

    if motives is None:
        motives = [random_string(motif_length, alphabet_list) for _ in range(n_motives)]
    else:
        motif_length = len(motives[0])
        n_motives = len(motives)

    # Floor division keeps these values integers under both Python 2 and
    # Python 3 (plain '/' yields floats on Python 3 and range() then fails).
    left_length = (sequence_length - motif_length) // 2
    # The right flank takes whatever is left, so an odd remainder no longer
    # makes the sequence one symbol shorter than sequence_length.
    right_length = sequence_length - motif_length - left_length
    n_seq_per_motif = n_sequences // n_motives

    seqs = []
    for _ in range(n_seq_per_motif):
        for motif in motives:
            # draw order (left flank, right flank, motif noise) is kept as in the original code
            left_flanking = random_string(left_length, alphabet_list)
            right_flanking = random_string(right_length, alphabet_list)
            noisy_motif = perturb(motif, alphabet_list, p)
            seqs.append(('>ID%d' % len(seqs), left_flanking + noisy_motif + right_flanking))
    return motives, seqs

In [4]:
from eden.motif import SequenceMotif
# Print the class documentation (constructor defaults and the fit/predict/transform/save/load methods) for reference.
help(SequenceMotif)


Help on class SequenceMotif in module eden.motif:

class SequenceMotif(__builtin__.object)
 |  Methods defined here:
 |  
 |  __init__(self, min_subarray_size=7, max_subarray_size=10, min_motif_count=1, min_cluster_size=1, training_size=None, negative_ratio=2, shuffle_order=2, n_iter_search=1, complexity=4, nbits=20, clustering_algorithm=None, n_jobs=4, n_blocks=8, block_size=None, pre_processor_n_jobs=4, pre_processor_n_blocks=8, pre_processor_block_size=None, random_state=1)
 |  
 |  fit(self, seqs, neg_seqs=None)
 |      Builds a discriminative estimator.
 |      Identifies the maximal subarrays in the data.
 |      Clusters them with the clustering algorithm provided in the initialization phase.
 |      For each cluster builds a fast sequence search model (Aho Corasick data structure).
 |  
 |  fit_predict(self, seqs, return_list=False)
 |  
 |  fit_transform(self, seqs, return_match=False)
 |  
 |  load(self, obj)
 |  
 |  predict(self, seqs, return_list=False)
 |      Returns for each instance a list with the cluster ids that have a hit
 |      if  return_list=False then just return 1 if there is at least one hit from one cluster.
 |  
 |  save(self, model_name)
 |  
 |  transform(self, seqs, return_match=False)
 |      Transform an instance to a dense vector with features as cluster ID and entries 0/1 if a motif is found,
 |      if 'return_match' argument is True, then write a pair with (start position,end position)  in the entry
 |      instead of 0/1
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)

Experimental Setup


In [5]:
#setup parameters
alphabet='ACGU'
motives=['AAAAAAAAAA','CCCCCCCCCC','GGGGGGGGGG','UUUUUUUUUU']
sequence_length=300
n_sequences=40
p=0.3

#seed the `random` module (imported in the dataset-code cell) so that the
#generated dataset and the sample printed below are identical on every run
random.seed(1)

#make dataset
motives, seqs = make_artificial_dataset(alphabet=alphabet,motives=motives,sequence_length=sequence_length,n_sequences=n_sequences,p=p)

#display: one line per motif -- the clean motif followed by 9 perturbed variants
print('Motives and sample of their perturbed variants:')
print('')
alphabet_list=list(alphabet)
for motif in motives:
    variants=[perturb(motif,alphabet_list,p=p) for i in range(9)]
    #single-argument print: same output under Python 2 and Python 3
    print(' '.join([motif]+variants))


Motives and sample of their perturbed variants:

AAAAAAAAAA AAAAAACCAA AAGGAGACAA AACUUAAAAA AAAAAACAAA AAAAAAAAAA CCAAAAAAUA AAACAACAAA AGUAAACAGU AAGGAAUAAA
CCCCCCCCCC CCGCCUCACC CCCAGGCCCC CCCUCCCCCC CCCACCCCCC CCGGACCCCA CAUCCCCGAC UCCCCCCCAC UCCCCGACCC UCCGCCCCCC
GGGGGGGGGG GAGCAGGUCU GGGGCGGUGG GGGGGGGUUG GGGAGGAGGG GGGGAGGGCG CGGGGGGGGG UGUGGGGUGG CGGGCGGGUG GUGGGGGGGG
UUUUUUUUUU AGUUUAUAUU UUUUUUAUUU UUUUGUUUAG UUUGUUUUUC UUUUUCUACU UUUCUUUUUU CUUUUUUUUU UUAUAUCUUU UUUUUUUUUU

In [6]:
#write the positive sequences: each record is a header line followed by a sequence line
#(the headers produced by make_artificial_dataset already start with '>')
fname='artificial_motif_search_dataset.fa'
with open(fname,'w') as fasta_file:
    fasta_file.writelines(header+"\n"+seq+"\n" for header,seq in seqs)

#build explicit negative sequences by shuffling the positives and write them in the same format
#(times=2, order=2 are passed to EDeN's shuffle modifier -- presumably 2 shuffles per sequence
#preserving 2-mer statistics; confirm in eden.modifier.seq)
from eden.modifier.seq import seq_to_seq, shuffle_modifier
neg_seqs = list(seq_to_seq(seqs, modifier=shuffle_modifier, times=2, order=2))
fname='artificial_motif_search_dataset_negatives.fa'
with open(fname,'w') as fasta_file:
    fasta_file.writelines(header+"\n"+seq+"\n" for header,seq in neg_seqs)

In [1]:
%%time
from sklearn.cluster import MiniBatchKMeans
#one cluster per planted motif; random_state is fixed so that the cluster
#assignments (and therefore the cluster ids printed later) are stable across runs
ca = MiniBatchKMeans(n_clusters=4, random_state=1)

from eden.motif import SequenceMotif
#complexity and nbits are lowered from the defaults (4 and 20) listed by help(SequenceMotif)
seqmot = SequenceMotif(complexity=2, nbits=14, clustering_algorithm=ca, min_motif_count=1, n_jobs=2)
#NOTE: `seqs` and `neg_seqs` must already exist in the kernel (dataset and save-to-file cells);
#running this cell first on a fresh kernel fails with NameError
seqmot.fit(seqs, neg_seqs)
#persist the fitted model under the name 'seqmot' so it can be reloaded with load('seqmot')
seqmot.save('seqmot')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-681fc90bf0a1> in <module>()
----> 1 get_ipython().run_cell_magic(u'time', u'', u"from sklearn.cluster import MiniBatchKMeans\nca = MiniBatchKMeans(n_clusters=4)\n\nfrom eden.motif import SequenceMotif\nseqmot = SequenceMotif(complexity=2, nbits=14, clustering_algorithm=ca, min_motif_count=1, n_jobs=2)\nseqmot.fit(seqs, neg_seqs)\nseqmot.save('seqmot')")

/Users/costa/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_cell_magic(self, magic_name, line, cell)
   2262             magic_arg_s = self.var_expand(line, stack_depth)
   2263             with self.builtin_trap:
-> 2264                 result = fn(magic_arg_s, cell)
   2265             return result
   2266 

/Users/costa/anaconda/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)

/Users/costa/anaconda/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/Users/costa/anaconda/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1164         else:
   1165             st = clock2()
-> 1166             exec(code, glob, local_ns)
   1167             end = clock2()
   1168             out = None

<timed exec> in <module>()

NameError: name 'seqs' is not defined

In [8]:
#list every cluster id followed by its motives, most frequent first
for cluster_id in seqmot.motives_db:
    print(cluster_id)
    #entries are (count, motif) pairs, so a descending sort ranks by count
    ranked = sorted(seqmot.motives_db[cluster_id], reverse=True)
    for count, motif in ranked:
        print('%s %s' % (motif, count))


0
AAAAAAAAAA 4
AAAAAAAAAG 2
GCCAUAAAA 1
GAAAAGAUAC 1
CUUAAAAAAG 1
AGAAAAAAAG 1
AAUAAAAAA 1
1
GUAGAUGCU 1
GUACGAGCC 1
GGGGGGGGGG 1
CCCCCCCUC 1
CCCCCCCCCC 1
CAUUGCUAAG 1
AAUACCGAGC 1
2
UUUUUUUUUA 1
UAUUUUUUU 1
CUAUUUAUC 1
3
UUAUUAAUA 1
UUAUUAAAUU 1
UAAUAAUUG 1
GGGGAUUAA 1
GAAUUAAAUU 1
CGAUAAUUGC 1
CCGAUUAUAU 1
CAAUUAAUU 1
AUUAAAUAG 1
AAUUACUUAG 1

In [9]:
from eden.motif import SequenceMotif
# Create an instance with default parameters, then load the model stored under the
# name 'seqmot' (the name given to seqmot.save) -- presumably this restores the fitted state.
seqmot2 = SequenceMotif()
seqmot2.load('seqmot')

In [10]:
#one entry per input sequence; per help(SequenceMotif), return_list=False gives a single value instead of the list of cluster ids
predictions=seqmot2.predict(seqs, return_list=False)
#loop variable is `prediction`, not `p`: `p` is the perturbation probability set in the setup cell
#and must not be overwritten as a side effect of displaying results
for prediction in predictions: print(prediction)


1
1
1
0
3
0
0
0
1
0
0
0
1
1
0
1
1
2
1
1
1
1
0
0
0
1
0
0
0
0
0
0
1
1
1
1
1
1
2
1

In [12]:
from eden.motif import SequenceMotif
#reload the model stored under the name 'seqmot' into a fresh instance
seqmot2 = SequenceMotif()
seqmot2.load('seqmot')

#per help(SequenceMotif), return_list=True gives for each sequence the list of cluster ids that have a hit
predictions=seqmot2.predict(seqs, return_list=True)
#loop variable is `prediction`, not `p`: `p` is the perturbation probability set in the setup cell
#and must not be overwritten as a side effect of displaying results
for prediction in predictions: print(prediction)


[0]
[3]
[3]
[]
[3, 0, 1]
[]
[]
[]
[0]
[]
[]
[]
[0]
[3]
[]
[3]
[0]
[3, 2]
[1]
[2]
[0]
[1]
[]
[]
[]
[1]
[]
[]
[]
[]
[]
[]
[0]
[3]
[1]
[1]
[0]
[0]
[1, 3]
[2]

In [13]:
#per help(SequenceMotif): one dense vector per sequence, indexed by cluster id, with 0/1 entries (motif found or not)
predictions=seqmot2.transform(seqs, return_match=False)
#loop variable is `prediction`, not `p`: `p` is the perturbation probability set in the setup cell
#and must not be overwritten as a side effect of displaying results
for prediction in predictions: print(prediction)


[1, 0, 0, 0]
[0, 0, 0, 1]
[0, 0, 0, 1]
[0, 0, 0, 0]
[1, 1, 0, 1]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[1, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[1, 0, 0, 0]
[0, 0, 0, 1]
[0, 0, 0, 0]
[0, 0, 0, 1]
[1, 0, 0, 0]
[0, 0, 1, 1]
[0, 1, 0, 0]
[0, 0, 1, 0]
[1, 0, 0, 0]
[0, 1, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 1, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[1, 0, 0, 0]
[0, 0, 0, 1]
[0, 1, 0, 0]
[0, 1, 0, 0]
[1, 0, 0, 0]
[1, 0, 0, 0]
[0, 1, 0, 1]
[0, 0, 1, 0]

In [38]:
#per help(SequenceMotif): with return_match=True each entry holds (start position, end position) pairs instead of 0/1
predictions=seqmot2.transform(seqs, return_match=True)
#loop variable is `prediction`, not `p`: `p` is the perturbation probability set in the setup cell
#and must not be overwritten as a side effect of displaying results
for prediction in predictions: print(prediction)


[[], [(141, 151), (143, 152), (144, 153)], [], [(135, 144)]]
[[], [], [], []]
[[(145, 154), (145, 155), (146, 155), (146, 156), (147, 156)], [], [(195, 204)], []]
[[(188, 197)], [], [], [(0, 9)]]
[[], [], [], []]
[[(72, 81)], [], [], [(145, 155), (258, 268)]]
[[(146, 156)], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[(145, 154), (145, 155), (146, 155), (176, 185)], [], [], [(125, 135)]]
[[], [], [], []]
[[], [(144, 153), (145, 154), (145, 155), (146, 155), (147, 156)], [], []]
[[], [], [], [(146, 155)]]
[[], [], [], []]
[[], [], [], []]
[[], [(145, 154)], [], [(91, 100), (250, 260)]]
[[], [], [], []]
[[], [], [], []]
[[], [], [(147, 157)], [(85, 94)]]
[[], [], [(44, 53)], []]
[[], [], [], [(143, 152), (144, 154), (146, 155)]]
[[], [], [], []]
[[], [], [(146, 156), (147, 156), (147, 157)], []]
[[], [(141, 151), (142, 151)], [], []]
[[], [], [], [(144, 154), (145, 154), (147, 156), (147, 157)]]
[[], [], [], []]
[[], [(53, 63)], [], []]
[[], [], [], []]
[[(7, 17), (29, 39), (183, 193)], [], [], [(247, 256)]]
[[], [], [], []]
[[], [], [(273, 282)], [(42, 52)]]
[[(94, 104)], [], [], []]
[[], [], [], [(144, 153)]]
[[], [], [(191, 200), (194, 203)], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[(27, 37)], [], [(145, 155), (146, 155)], []]
[[], [(143, 152)], [], []]
[[], [], [], [(144, 153)]]
[[(194, 204)], [], [], []]
[[], [], [(144, 154), (145, 154)], []]
[[], [], [], []]
[[(138, 148)], [], [], [(143, 152), (144, 153), (144, 154), (145, 154), (145, 155)]]
[[], [], [], []]
[[], [], [], []]
[[], [(78, 87)], [], []]
[[], [], [], [(148, 157)]]
[[(145, 154)], [], [], []]
[[], [], [(146, 155)], []]
[[], [(142, 152), (144, 154)], [], []]
[[], [], [], [(164, 173)]]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], [(145, 155)]]
[[], [(230, 239)], [], []]
[[], [], [(142, 152), (145, 155), (146, 155)], []]
[[(193, 203)], [(143, 152), (143, 153), (144, 153), (145, 155)], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [(18, 27), (145, 155)], []]
[[], [(143, 152)], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [(146, 156), (148, 158)], [(129, 138)]]
[[], [(145, 155)], [], []]
[[], [(81, 91)], [], []]
[[(143, 152), (144, 153)], [], [(67, 76)], []]
[[], [], [], []]
[[], [(143, 152), (144, 153), (144, 154), (145, 154), (145, 155), (146, 155)], [], []]
[[], [], [], [(50, 59), (143, 153)]]
[[(145, 154), (145, 155), (146, 155), (146, 156), (147, 156), (148, 157)], [], [], []]
[[], [], [], []]
[[], [(145, 154), (145, 155), (146, 155), (146, 156), (147, 156)], [], []]
[[], [], [], []]
[[(145, 155), (146, 155)], [], [], []]
[[], [], [(144, 154), (145, 154)], []]
[[], [(145, 154), (146, 156)], [(117, 126), (225, 234)], []]
[[], [], [], []]
[[(145, 154), (145, 155)], [], [], []]
[[], [], [(142, 151), (142, 152), (143, 152), (143, 153), (144, 153), (144, 154), (145, 154), (145, 155), (146, 155)], []]
[[], [], [], []]
[[], [], [], []]
[[(65, 74)], [], [], []]
[[], [], [], [(157, 166)]]
[[], [], [], []]
[[(77, 86)], [], [], []]
[[(145, 155)], [], [], []]
[[], [], [(144, 153), (144, 154), (156, 165)], []]
[[], [], [], []]
[[], [], [], [(145, 155), (190, 199), (191, 200)]]
[[], [], [], []]
[[], [], [], [(62, 71)]]
[[], [(142, 151)], [], []]
[[], [], [], [(145, 154)]]
[[(144, 154), (210, 219)], [], [(117, 126)], [(188, 198)]]
[[], [], [], []]
[[], [(145, 154), (145, 155), (146, 155)], [], []]
[[], [], [], [(146, 155), (146, 156), (147, 157), (148, 157)]]
[[], [], [], []]
[[(27, 36)], [], [(146, 155), (194, 204)], []]
[[], [(146, 155), (147, 156)], [(287, 296)], []]
[[], [], [], [(145, 155), (146, 155)]]
[[(145, 155)], [], [], []]
[[], [], [(146, 156)], []]
[[(216, 225)], [(144, 154)], [], []]
[[], [], [], []]
[[(147, 156)], [], [(99, 108)], []]
[[], [], [(145, 155), (147, 157)], []]
[[], [], [], []]
[[], [], [], []]
[[(144, 153)], [], [], []]
[[], [], [], []]
[[], [(147, 156)], [], []]
[[], [], [], [(146, 155), (146, 156), (147, 156)]]
[[], [], [], []]
[[], [], [(143, 152), (143, 153), (144, 153), (144, 154), (145, 154)], []]
[[], [], [(166, 175)], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [(147, 156)], []]
[[], [], [], []]
[[], [], [], [(144, 153), (144, 154), (145, 154), (145, 155), (146, 155), (146, 156), (147, 157), (148, 157)]]
[[], [(13, 22)], [], []]
[[], [], [(144, 154), (145, 155)], [(244, 253), (244, 254)]]
[[], [(144, 154)], [], []]
[[], [(122, 131)], [], []]
[[], [], [], []]
[[], [], [(146, 155)], []]
[[], [(141, 151), (143, 152), (143, 153), (144, 153), (145, 154)], [], []]
[[], [], [], [(145, 155)]]
[[(118, 127)], [], [], []]
[[], [], [(146, 156)], []]
[[], [(146, 155)], [], []]
[[], [], [], [(145, 155)]]
[[(145, 154), (145, 155), (146, 155)], [], [], [(80, 90)]]
[[], [], [], []]
[[], [(145, 154), (145, 155), (146, 155), (146, 156), (147, 156), (148, 157)], [], []]
[[], [], [], []]
[[(145, 155), (149, 158)], [], [], []]
[[], [], [(145, 154), (145, 155), (146, 155)], [(74, 83)]]
[[], [], [(274, 283)], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], [(141, 150)]]
[[(19, 28), (143, 152)], [], [], []]
[[], [(164, 173), (165, 174)], [], []]
[[], [], [], []]
[[], [], [], [(144, 154)]]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], [(148, 158)]]
[[], [], [], []]
[[(239, 249)], [], [(146, 156), (147, 156), (171, 181)], []]
[[], [], [], [(78, 87)]]
[[], [], [], [(144, 153), (145, 154)]]
[[], [], [], []]
[[], [], [(145, 155)], []]
[[], [], [(163, 173)], [(191, 200)]]
[[], [], [], [(144, 153), (145, 154), (145, 155), (146, 155), (147, 156)]]
[[], [], [], []]
[[], [], [], []]
[[], [(145, 155)], [], []]
[[], [], [], []]
[[(145, 154)], [], [], []]
[[], [], [(145, 154), (145, 155), (146, 155)], []]
[[], [(145, 155)], [], []]
[[], [], [], [(144, 153), (145, 154)]]
[[(145, 155), (146, 155), (147, 156), (148, 157)], [], [], []]
[[], [], [(81, 90), (125, 135)], []]
[[], [], [], []]
[[], [], [], [(145, 154), (147, 157), (148, 157), (149, 158)]]
[[(144, 153), (144, 154), (145, 154), (145, 155), (146, 155), (146, 156), (147, 156), (147, 157), (148, 157)], [], [], []]
[[], [], [(146, 156)], [(217, 227)]]
[[], [], [], [(134, 143), (215, 225)]]
[[], [], [], []]
[[], [], [], []]
[[], [], [(143, 152), (144, 154), (146, 155)], []]
[[], [(142, 152), (144, 153)], [], []]
[[], [], [], []]
[[(145, 154), (145, 155), (146, 155), (146, 156), (147, 156)], [], [], []]
[[], [], [(29, 39), (30, 39), (144, 153)], []]
[[], [], [], []]
[[], [], [], [(145, 155)]]
[[(70, 79)], [], [], []]
[[], [], [(144, 154), (146, 155)], []]
[[], [(143, 152), (143, 153), (144, 153), (144, 154), (145, 154), (145, 155), (146, 155), (147, 156)], [], []]
[[], [], [], [(146, 155)]]
[[(145, 154), (145, 155), (146, 155), (146, 156), (147, 156)], [], [], [(101, 110)]]
[[], [], [], []]
[[(272, 281)], [(142, 151), (144, 153)], [], []]
[[], [], [], []]
[[(25, 34), (145, 155)], [], [], []]
[[], [], [(145, 154), (145, 155)], []]
[[], [], [], []]
[[], [], [(268, 277)], [(146, 155), (147, 156), (147, 157)]]
[[(16, 25)], [], [], [(102, 111)]]
[[(50, 59)], [], [], []]
[[(214, 224)], [], [], []]
[[], [(210, 219)], [], []]
[[], [], [], []]
[[], [], [], [(187, 196)]]
[[], [], [], []]
[[], [], [], [(7, 16)]]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [(189, 199)], [(148, 158)]]
[[(147, 156)], [], [], [(265, 274), (265, 275)]]
[[], [], [(160, 170)], []]
[[], [], [], []]
[[], [], [], [(146, 155)]]
[[], [], [], []]
[[], [], [(38, 48)], []]
[[], [], [(55, 64)], []]
[[(279, 288)], [], [], []]
[[], [], [], []]
[[], [], [(145, 154), (145, 155), (146, 155)], []]
[[], [], [], [(97, 107)]]
[[], [], [], []]
[[(145, 154), (145, 155), (146, 155), (147, 156)], [], [], []]
[[], [], [], []]
[[(68, 77)], [(143, 152), (144, 153), (144, 154), (145, 154), (145, 155), (146, 155)], [], []]
[[], [], [], [(148, 157), (148, 158), (226, 235), (248, 258)]]
[[], [], [], []]
[[], [], [], []]
[[], [(146, 155), (146, 156), (147, 156), (148, 157)], [], [(20, 30)]]
[[(40, 49)], [], [], [(146, 155), (146, 156), (147, 156), (148, 157)]]
[[], [], [], []]
[[], [], [(143, 152), (143, 153), (144, 153), (144, 154), (145, 154), (145, 155), (146, 155)], []]
[[], [(143, 152)], [], []]
[[], [], [], [(144, 153), (145, 154), (145, 155), (146, 156), (147, 156), (172, 181)]]
[[(145, 155), (146, 155)], [], [], []]
[[], [], [(144, 154)], []]
[[(236, 245)], [(144, 153), (145, 154), (145, 155), (146, 155), (146, 156), (147, 156), (148, 157)], [], []]
[[], [], [], [(146, 155)]]
[[(144, 153)], [], [], []]
[[], [], [(144, 153), (144, 154), (146, 155)], []]
[[], [(148, 158)], [], []]
[[], [], [], [(146, 156), (237, 247)]]
[[(279, 288)], [], [], []]
[[], [], [(145, 155)], []]
[[], [(146, 156)], [], []]
[[], [], [], [(103, 112)]]
[[], [], [], []]
[[], [], [(143, 153), (144, 153), (144, 154), (145, 154)], []]
[[], [], [], []]
[[], [], [], [(145, 154), (146, 156), (150, 160)]]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[(147, 156), (193, 202)], [], [], []]
[[], [], [(145, 154)], []]
[[], [], [], []]
[[], [], [], [(143, 153), (145, 154), (147, 156), (148, 157)]]
[[], [], [], []]
[[], [], [(139, 148), (139, 149), (141, 150), (141, 151), (142, 151), (142, 152), (143, 152)], []]
[[], [(145, 155), (146, 156), (147, 156)], [], []]
[[(24, 33)], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[(282, 291)], [], [], [(144, 153), (144, 154), (145, 154), (146, 155)]]
[[], [], [], []]
[[], [], [(145, 155)], []]
[[], [], [], []]
[[], [], [], []]
[[(145, 155), (146, 155), (146, 156), (147, 156), (147, 157), (148, 157)], [], [], []]
[[], [], [(144, 153)], []]
[[], [], [], []]
[[], [], [], [(146, 155)]]
[[], [], [], []]
[[], [], [(145, 155)], []]
[[], [], [], []]
[[], [], [], [(144, 154)]]
[[(144, 154), (145, 155)], [], [], []]
[[], [], [(185, 194)], []]
[[], [], [], []]
[[], [], [], []]
[[(146, 155), (146, 156), (147, 156), (147, 157), (148, 157), (148, 158), (149, 158), (149, 159), (150, 159), (152, 161)], [], [], []]
[[(124, 134)], [], [(99, 108)], []]
[[], [(144, 153), (145, 154), (145, 155), (146, 155), (146, 156), (147, 156), (148, 157)], [], []]
[[], [], [], [(146, 155), (147, 156), (149, 159)]]
[[(144, 154)], [], [], []]
[[], [], [], []]
[[], [(145, 154)], [], []]
[[], [], [], []]
[[], [(14, 24), (19, 28)], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], [(145, 155)]]
[[], [], [], [(254, 264)]]
[[], [], [(145, 155)], []]
[[], [], [(234, 244)], []]
[[], [], [], []]
[[(144, 153), (144, 154)], [], [], []]
[[], [(53, 62)], [(144, 154), (145, 154)], []]
[[], [(145, 155)], [], []]
[[], [], [], [(279, 288)]]
[[], [], [], []]
[[], [], [], []]
[[], [], [], [(249, 258)]]
[[], [], [], [(146, 155), (146, 156), (147, 157), (148, 157), (149, 158)]]
[[], [], [], []]
[[], [], [(146, 155)], []]
[[], [], [], []]
[[], [], [], [(145, 154), (146, 155)]]
[[(143, 153), (144, 153), (144, 154), (145, 154), (145, 155), (146, 155), (146, 156), (147, 156), (148, 157)], [], [], []]
[[], [], [(145, 154), (145, 155), (241, 251)], []]
[[], [(144, 153), (145, 154), (145, 155), (146, 155), (146, 156), (147, 156)], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [(145, 154)], []]
[[], [(147, 156)], [], [(227, 236)]]
[[], [], [], []]
[[(144, 153), (144, 154), (145, 154), (145, 155), (146, 155), (146, 156), (147, 156), (148, 157)], [(276, 286)], [(182, 191)], []]
[[(227, 237)], [], [], []]
[[], [], [], [(104, 113)]]
[[], [], [], [(144, 153), (144, 154), (145, 154), (145, 155), (146, 155)]]
[[(145, 155)], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], [(68, 77)]]
[[], [], [], []]
[[], [], [(145, 155), (146, 155), (146, 156)], []]
[[], [(150, 159)], [], []]
[[], [], [], [(144, 153), (145, 154), (145, 155), (146, 155), (146, 156), (147, 156), (148, 157)]]
[[], [], [], []]
[[], [], [(144, 153), (145, 155)], []]
[[], [(145, 155)], [], []]
[[], [], [], [(145, 155), (146, 155)]]
[[(143, 152), (146, 155), (146, 156), (147, 156), (148, 157)], [], [], []]
[[], [], [(149, 159)], []]
[[], [(145, 155), (146, 155)], [], []]
[[], [], [], [(143, 152), (143, 153)]]
[[(144, 154), (145, 155)], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[(117, 126)], [], [], [(132, 141)]]
[[], [], [(144, 153), (144, 154), (145, 154), (145, 155), (146, 155)], []]
[[], [], [], []]
[[], [], [], []]
[[(145, 154)], [], [], []]
[[], [], [(149, 159), (150, 160)], []]
[[(268, 278)], [(145, 155)], [], []]
[[], [], [], [(147, 157)]]
[[(145, 154), (145, 155)], [], [], []]
[[], [], [(145, 155)], []]
[[], [(144, 153), (144, 154), (145, 154), (145, 155), (146, 155)], [], []]
[[], [], [], [(148, 157)]]
[[], [], [], []]
[[], [], [], []]
[[], [], [], [(198, 207), (198, 208)]]
[[], [], [], []]
[[(146, 156)], [], [], []]
[[], [], [(145, 155)], []]
[[], [], [], []]
[[], [], [], []]
[[(118, 128)], [], [], [(179, 188)]]
[[], [], [], []]
[[], [(142, 151)], [], []]
[[], [], [], [(144, 153)]]
[[(143, 153), (148, 157)], [], [], []]
[[], [], [(150, 160)], []]
[[], [], [], []]
[[(243, 252)], [], [], [(143, 152), (143, 153), (144, 153), (144, 154), (145, 154), (145, 155), (146, 155), (147, 156), (147, 157)]]
[[], [], [], []]
[[], [], [], []]
[[], [(144, 154), (145, 155), (146, 155), (148, 157)], [], []]
[[(282, 292)], [], [], [(244, 253)]]
[[], [], [], []]
[[], [], [(144, 154)], []]
[[], [(144, 154)], [], [(46, 56)]]
[[], [], [], [(133, 143)]]
[[(144, 153), (144, 154)], [], [], []]
[[], [], [], []]
[[], [(145, 154)], [], []]
[[], [], [], [(144, 154), (168, 178)]]
[[(145, 154)], [], [], []]
[[], [], [(143, 152), (143, 153), (144, 154), (145, 154)], []]
[[], [(142, 152), (144, 153), (144, 154), (145, 154), (145, 155), (146, 155), (147, 156)], [], []]
[[(167, 176)], [], [(30, 39)], [(144, 153), (145, 154), (248, 257)]]
[[(145, 155)], [], [], []]
[[], [], [], [(189, 199)]]
[[], [], [], []]
[[], [], [], []]
[[], [], [], []]
[[], [], [(144, 154), (145, 154), (145, 155), (146, 155), (146, 156), (147, 156)], []]
[[], [], [], [(133, 142)]]
[[], [], [], [(144, 153), (144, 154)]]
[[], [], [], []]
[[], [], [(145, 154), (145, 155), (146, 156), (147, 156)], []]

In [20]:
%%time
from sklearn.cluster import Birch
#same pipeline as before, clustering the extracted motives with Birch
ca = Birch(n_clusters=4, threshold=0.1, branching_factor=50)

from eden.motif import SequenceMotif
#no explicit negatives are passed to fit here; only motives seen at least twice are kept
seqmot = SequenceMotif(min_motif_count=2, clustering_algorithm=ca)
seqmot.fit(seqs)

#list every surviving cluster id followed by its motives, most frequent first
for cluster_id in seqmot.motives_db:
    print(cluster_id)
    ranked = sorted(seqmot.motives_db[cluster_id], reverse=True)
    for count, motif in ranked:
        print('%s %s' % (motif, count))


Positive data: Instances: 40 ; Features: 1048577 with an avg of 5132 features per instance
Negative data: Instances: 80 ; Features: 1048577 with an avg of 5139 features per instance
Elapsed time: 5.4 secs
model induction: 40 positive instances 5 s
motives extraction: 38 motives 2 s
motives clustering: 4 clusters 2 s
after filtering: 1 motives 1 clusters 0 s
motif model construction: 0 s
updated motif counts: 0 s
0
GGGGGGGGGG 2
CPU times: user 6.18 s, sys: 1.5 s, total: 7.68 s
Wall time: 11.1 s

In [21]:
%%time
from sklearn.cluster import DBSCAN
#same pipeline as before, clustering the extracted motives with DBSCAN
ca = DBSCAN(min_samples=3, eps=0.1)

from eden.motif import SequenceMotif
#no explicit negatives are passed to fit here; only motives seen at least twice are kept
seqmot = SequenceMotif(min_motif_count=2, clustering_algorithm=ca)
seqmot.fit(seqs)

#list every surviving cluster id followed by its motives, most frequent first
for cluster_id in seqmot.motives_db:
    print(cluster_id)
    ranked = sorted(seqmot.motives_db[cluster_id], reverse=True)
    for count, motif in ranked:
        print('%s %s' % (motif, count))


Positive data: Instances: 40 ; Features: 1048577 with an avg of 5132 features per instance
Negative data: Instances: 80 ; Features: 1048577 with an avg of 5139 features per instance
Elapsed time: 5.7 secs
model induction: 40 positive instances 6 s
motives extraction: 30 motives 2 s
motives clustering: 1 clusters 0 s
after filtering: 0 motives 0 clusters 0 s
motif model construction: 0 s
updated motif counts: 0 s
CPU times: user 4.11 s, sys: 1.4 s, total: 5.51 s
Wall time: 8.79 s

In [ ]: