Usage example for SMoD wrapper


In [4]:
%matplotlib inline
from smod_wrapper import SMoDWrapper
from utilities import Weblogo

In [5]:
%load_ext autoreload
%autoreload 2

from eden.util import configure_logging
import logging
logger = logging.getLogger()
configure_logging(logger,verbosity=2)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [ ]:


In [6]:
train = [
    ('ce1cg', 
     'TAATGTTTGTGCTGGTTTTTGTGGCATCGGGCGAGAATAGCGCGTGGTGTGAAAGACTGTTTTTTTGATCGTTTTCACAAAAATGGAAGTCCACAGTCTTGACAG'),
    ('ara', 
     'GACAAAAACGCGTAACAAAAGTGTCTATAATCACGGCAGAAAAGTCCACATTGATTATTTGCACGGCGTCACACTTTGCTATGCCATAGCATTTTTATCCATAAG'),
    ('bglr1', 
     'ACAAATCCCAATAACTTAATTATTGGGATTTGTTATATATAACTTTATAAATTCCTAAAATTACACAAAGTTAATAACTGTGAGCATGGTCATATTTTTATCAAT'),
    ('crp', 
     'CACAAAGCGAAAGCTATGCTAAAACAGTCAGGATGCTACAGTAATACATTGATGTACTGCATGTATGCAAAGGACGTCACATTACCGTGCAGTACAGTTGATAGC'),
    ('cya', 
     'ACGGTGCTACACTTGTATGTAGCGCATCTTTCTTTACGGTCAATCAGCAAGGTGTTAAATTGATCACGTTTTAGACCATTTTTTCGTCGTGAAACTAAAAAAACC'),
    ('deop2', 
     'AGTGAATTATTTGAACCAGATCGCATTACAGTGATGCAAACTTGTAAGTAGATTTCCTTAATTGTGATGTGTATCGAAGTGTGTTGCGGAGTAGATGTTAGAATA'),
    ('gale', 
     'GCGCATAAAAAACGGCTAAATTCTTGTGTAAACGATTCCACTAATTTATTCCATGTCACACTTTTCGCATCTTTGTTATGCTATGGTTATTTCATACCATAAGCC'),
    ('ilv', 
     'GCTCCGGCGGGGTTTTTTGTTATCTGCAATTCAGTACAAAACGTGATCAACCCCTCAATTTTCCCTTTGCTGAAAAATTTTCCATTGTCTCCCCTGTAAAGCTGT'),
    ('lac', 
     'AACGCAATTAATGTGAGTTAGCTCACTCATTAGGCACCCCAGGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGGAATTGTGAGCGGATAACAATTTCAC'),
    ('male', 
     'ACATTACCGCCAATTCTGTAACAGAGATCACACAAAGCGACGGTGGGGCGTAGGGGCAAGGAGGATGGAAAGAGGTTGCCGTATAAAGAAACTAGAGTCCGTTTA'),
    ('malk', 
     'GGAGGAGGCGGGAGGATGAGAACACGGCTTCTGTGAACTAAACCGAGGTCATGTAAGGAATTTCGTGATGTTGCTTGCAAAAATCGTGGCGATTTTATGTGCGCA'),
    ('malt', 
     'GATCAGCGTCGTTTTAGGTGAGTTGTTAATAAAGATTTGGAATTGTGACACAGTGCAAATTCAGACACATAAAAAAACGTCATCGCTTGCATTAGAAAGGTTTCT'),
    ('ompa', 
     'GCTGACAAAAAAGATTAAACATACCTTATACAAGACTTTTTTTTCATATGCCTGACGGAGTTCACACTTGTAAGTTTTCAACTACGTTGTAGACTTTACATCGCC'),
    ('tnaa', 
     'TTTTTTAAACATTAAAATTCTTACGTAATTTATAATCTTTAAAAAAAGCATTTAATATTGCTCCCCGAACGATTGTGATTCGATTCACATTTAAACAATTTCAGA'),
    ('uxu1', 
     'CCCATGAGAGTGAAATTGTTGTGATGTGGTTAACCCAATTAGAATTCGGGATTGACATGTCTTACCAAAAGGTAGAACTTATACGCCATCTCATCCGATGCAAGC'),
    ('pbr322', 
     'CTGGCTTAACTATGCGGCATCAGAGCAGATTGTACTGAGAGTGCACCATATGCGGTGTGAAATACCGCACAGATGCGTAAGGAGAAAATACCGCATCAGGCGCTC'),
    ('trn9cat', 
     'CTGTGACGGAAGATCACTTCGCAGAATAAATAAATCCTGGTGTCCCTGTTGATACCGGGAAGCCCTGGGCCAACTTTTGGCGAAAATGAGACGTTGATCGGCACG'),
    ('tdc', 
     'GATTTTTATACTTTAACTTGTTGATATTTAAAGGTATTTAATTGTAATAACGATACTCTGGAAAGTATTGAAAGTTAATTTGTGAGTGGTCGCACATATCCTGTT'),
    ]

# test data consists of first 9 sequences of training data
test = train[:9]

In [9]:
from sklearn.cluster import KMeans
km = KMeans()

wl=Weblogo(color_scheme = 'classic')

smod = SMoDWrapper(alphabet='dna',
                 complexity = 3,
                 n_clusters = 3 * 3,
                 clusterer = KMeans(),
                 pos_block_size = 5,
                 neg_block_size = 5,
                 min_score=4,
                 min_freq=0.5,
                 min_cluster_size=10,
                 similarity_th=.5,
                 freq_th=0.03,
                 weblogo_obj=wl)

smod.fit(seqs=train)


Setup 0.05 secs
Fitting
0 (5, 32769) (0.15 secs) (delta: 0.15)
1 (5, 32769) (0.17 secs) (delta: 0.01)
2 (5, 32769) (0.18 secs) (delta: 0.00)
3 (5, 32769) (0.18 secs) (delta: 0.00)
4 (5, 32769) (0.18 secs) (delta: 0.00)
5 (5, 32769) (0.18 secs) (delta: 0.00)
6 (5, 32769) (0.19 secs) (delta: 0.00)
7 (5, 32769) (0.19 secs) (delta: 0.00)
8 (5, 32769) (0.19 secs) (delta: 0.00)
9 (5, 32769) (0.19 secs) (delta: 0.00)
10 (5, 32769) (0.20 secs) (delta: 0.00)
11 (5, 32769) (0.20 secs) (delta: 0.00)
12 (5, 32769) (0.20 secs) (delta: 0.00)
13 (5, 32769) (0.20 secs) (delta: 0.00)
Setup 0.07 secs
Annotating
0 (0.27 secs) (delta: 0.27)
1 (0.27 secs) (delta: 0.00)
2 (0.27 secs) (delta: 0.00)
3 (0.27 secs) (delta: 0.00)
4 (0.27 secs) (delta: 0.00)
5 (0.29 secs) (delta: 0.02)
6 (0.30 secs) (delta: 0.01)
7 (0.30 secs) (delta: 0.00)
8 (0.39 secs) (delta: 0.09)
9 (0.39 secs) (delta: 0.00)
10 (0.40 secs) (delta: 0.01)
11 (0.44 secs) (delta: 0.04)
12 (0.44 secs) (delta: 0.00)
13 (0.46 secs) (delta: 0.02)
ECDF fit on 141 values
Optimal params: a:20.68  b:6.08
Setup 0.05 secs
Annotating
0 (0.17 secs) (delta: 0.17)
1 (0.17 secs) (delta: 0.00)
2 (0.21 secs) (delta: 0.04)
3 (0.21 secs) (delta: 0.00)
4 (0.26 secs) (delta: 0.04)
5 (0.26 secs) (delta: 0.00)
6 (0.26 secs) (delta: 0.00)
7 (0.28 secs) (delta: 0.02)
8 (0.39 secs) (delta: 0.11)
9 (0.39 secs) (delta: 0.00)
10 (0.39 secs) (delta: 0.00)
11 (0.42 secs) (delta: 0.02)
12 (0.42 secs) (delta: 0.00)
13 (0.43 secs) (delta: 0.01)
Working on: 76 fragments
Setup 0.03 secs
Vectorizing
0 (9, 32769) (0.01 secs) (delta: 0.01)
1 (9, 32769) (0.01 secs) (delta: 0.00)
2 (9, 32769) (0.01 secs) (delta: 0.00)
3 (9, 32769) (0.02 secs) (delta: 0.00)
4 (9, 32769) (0.02 secs) (delta: 0.00)
5 (9, 32769) (0.02 secs) (delta: 0.00)
6 (9, 32769) (0.02 secs) (delta: 0.00)
7 (9, 32769) (0.02 secs) (delta: 0.00)
Clustering
working on 72 instances
...done  in 0.23 secs
After clustering, 9 motives
Alignment
Cluster 0 (#11) (0.02 secs)
Cluster 1 (#22) (0.02 secs)
Cluster 4 (#12) (0.01 secs)
After motives computation, 3 motives
Joining: 0 (#11), 4 (#12) score: 0.67 deleting: 4  [0 is now #23]
After merge, 2 motives
After quality filter, 2 motives

In [10]:
for i in smod.original_motives_list:
    for j in i:
        print j
    print


('cya<loc>77:85<loc>', 'ATTTTTTC')
('ompa<loc>35:45<loc>', 'CTTTTTTTTC')
('tnaa<loc>0:10<loc>', 'TTTTTTAAAC')
('cya<loc>77:85<loc>', 'ATTTTTTC')
('ompa<loc>35:45<loc>', 'CTTTTTTTTC')
('tnaa<loc>0:10<loc>', 'TTTTTTAAAC')
('cya<loc>77:85<loc>', 'ATTTTTTC')
('ompa<loc>35:45<loc>', 'CTTTTTTTTC')
('tnaa<loc>0:10<loc>', 'TTTTTTAAAC')
('cya<loc>77:85<loc>', 'ATTTTTTC')
('ompa<loc>35:45<loc>', 'CTTTTTTTTC')
('ce1cg<loc>58:67<loc>', 'GTTTTTTTG')
('ce1cg<loc>14:23<loc>', 'GTTTTTGTG')
('ilv<loc>11:20<loc>', 'GTTTTTTGT')
('ce1cg<loc>58:67<loc>', 'GTTTTTTTG')
('ce1cg<loc>14:23<loc>', 'GTTTTTGTG')
('ilv<loc>11:20<loc>', 'GTTTTTTGT')
('ce1cg<loc>58:67<loc>', 'GTTTTTTTG')
('ce1cg<loc>14:23<loc>', 'GTTTTTGTG')
('ilv<loc>11:20<loc>', 'GTTTTTTGT')
('ce1cg<loc>58:67<loc>', 'GTTTTTTTG')
('ce1cg<loc>14:23<loc>', 'GTTTTTGTG')
('ilv<loc>11:20<loc>', 'GTTTTTTGT')

('ara<loc>2:9<loc>', 'CAAAAAC')
('cya<loc>95:104<loc>', 'TAAAAAAAC')
('gale<loc>5:13<loc>', 'TAAAAAAC')
('malt<loc>69:78<loc>', 'TAAAAAAAC')
('ompa<loc>5:13<loc>', 'CAAAAAAG')
('tnaa<loc>39:48<loc>', 'TAAAAAAAG')
('ara<loc>2:9<loc>', 'CAAAAAC')
('cya<loc>95:104<loc>', 'TAAAAAAAC')
('gale<loc>5:13<loc>', 'TAAAAAAC')
('malt<loc>69:78<loc>', 'TAAAAAAAC')
('ompa<loc>5:13<loc>', 'CAAAAAAG')
('tnaa<loc>39:48<loc>', 'TAAAAAAAG')
('ara<loc>2:9<loc>', 'CAAAAAC')
('cya<loc>95:104<loc>', 'TAAAAAAAC')
('gale<loc>5:13<loc>', 'TAAAAAAC')
('malt<loc>69:78<loc>', 'TAAAAAAAC')
('ompa<loc>5:13<loc>', 'CAAAAAAG')
('tnaa<loc>39:48<loc>', 'TAAAAAAAG')
('ara<loc>2:9<loc>', 'CAAAAAC')
('cya<loc>95:104<loc>', 'TAAAAAAAC')
('gale<loc>5:13<loc>', 'TAAAAAAC')
('malt<loc>69:78<loc>', 'TAAAAAAAC')


In [11]:
for i in smod.aligned_motives_list:
    for j in i:
        print j
    print


('cya<loc>77:85<loc>', '--ATTTTTTC')
('ompa<loc>35:45<loc>', 'CTTTTTTTTC')
('tnaa<loc>0:10<loc>', 'TTTTTTAAAC')
('cya<loc>77:85<loc>', '--ATTTTTTC')
('ompa<loc>35:45<loc>', 'CTTTTTTTTC')
('tnaa<loc>0:10<loc>', 'TTTTTTAAAC')
('cya<loc>77:85<loc>', '--ATTTTTTC')
('ompa<loc>35:45<loc>', 'CTTTTTTTTC')
('tnaa<loc>0:10<loc>', 'TTTTTTAAAC')
('cya<loc>77:85<loc>', '--ATTTTTTC')
('ompa<loc>35:45<loc>', 'CTTTTTTTTC')
('ce1cg<loc>58:67<loc>', '-GTTTTTTTG')
('ce1cg<loc>14:23<loc>', '-GTTTTTGTG')
('ilv<loc>11:20<loc>', '-GTTTTTTGT')
('ce1cg<loc>58:67<loc>', '-GTTTTTTTG')
('ce1cg<loc>14:23<loc>', '-GTTTTTGTG')
('ilv<loc>11:20<loc>', '-GTTTTTTGT')
('ce1cg<loc>58:67<loc>', '-GTTTTTTTG')
('ce1cg<loc>14:23<loc>', '-GTTTTTGTG')
('ilv<loc>11:20<loc>', '-GTTTTTTGT')
('ce1cg<loc>58:67<loc>', '-GTTTTTTTG')
('ce1cg<loc>14:23<loc>', '-GTTTTTGTG')
('ilv<loc>11:20<loc>', '-GTTTTTTGT')

('ara<loc>2:9<loc>', 'CAAAAAC--')
('cya<loc>95:104<loc>', 'TAAAAAAAC')
('gale<loc>5:13<loc>', 'TAAAAAAC-')
('malt<loc>69:78<loc>', 'TAAAAAAAC')
('ompa<loc>5:13<loc>', 'CAAAAAAG-')
('tnaa<loc>39:48<loc>', 'TAAAAAAAG')
('ara<loc>2:9<loc>', 'CAAAAAC--')
('cya<loc>95:104<loc>', 'TAAAAAAAC')
('gale<loc>5:13<loc>', 'TAAAAAAC-')
('malt<loc>69:78<loc>', 'TAAAAAAAC')
('ompa<loc>5:13<loc>', 'CAAAAAAG-')
('tnaa<loc>39:48<loc>', 'TAAAAAAAG')
('ara<loc>2:9<loc>', 'CAAAAAC--')
('cya<loc>95:104<loc>', 'TAAAAAAAC')
('gale<loc>5:13<loc>', 'TAAAAAAC-')
('malt<loc>69:78<loc>', 'TAAAAAAAC')
('ompa<loc>5:13<loc>', 'CAAAAAAG-')
('tnaa<loc>39:48<loc>', 'TAAAAAAAG')
('ara<loc>2:9<loc>', 'CAAAAAC--')
('cya<loc>95:104<loc>', 'TAAAAAAAC')
('gale<loc>5:13<loc>', 'TAAAAAAC-')
('malt<loc>69:78<loc>', 'TAAAAAAAC')


In [12]:
for i in smod.motives_list:
    for j in i:
        print j
    print


('cya<loc>77:85<loc>', '-ATTTTTTC')
('ompa<loc>35:45<loc>', 'TTTTTTTTC')
('tnaa<loc>0:10<loc>', 'TTTTTAAAC')
('cya<loc>77:85<loc>', 'TATTTTTTC')
('ompa<loc>35:45<loc>', 'TTTTTTTTC')
('tnaa<loc>0:10<loc>', 'TTTTTAAAC')
('cya<loc>77:85<loc>', 'GATTTTTTC')
('ompa<loc>35:45<loc>', 'TTTTTTTTC')
('tnaa<loc>0:10<loc>', 'TTTTTAAAC')
('cya<loc>77:85<loc>', 'TATTTTTTC')
('ompa<loc>35:45<loc>', 'TTTTTTTTC')
('ce1cg<loc>58:67<loc>', 'GTTTTTTTG')
('ce1cg<loc>14:23<loc>', 'GTTTTTGTG')
('ilv<loc>11:20<loc>', 'GTTTTTTGT')
('ce1cg<loc>58:67<loc>', 'GTTTTTTTG')
('ce1cg<loc>14:23<loc>', 'GTTTTTGTG')
('ilv<loc>11:20<loc>', 'GTTTTTTGT')
('ce1cg<loc>58:67<loc>', 'GTTTTTTTG')
('ce1cg<loc>14:23<loc>', 'GTTTTTGTG')
('ilv<loc>11:20<loc>', 'GTTTTTTGT')
('ce1cg<loc>58:67<loc>', 'GTTTTTTTG')
('ce1cg<loc>14:23<loc>', 'GTTTTTGTG')
('ilv<loc>11:20<loc>', 'GTTTTTTGT')

('ara<loc>2:9<loc>', 'CAAAAACC')
('cya<loc>95:104<loc>', 'TAAAAAAA')
('gale<loc>5:13<loc>', 'TAAAAAAC')
('malt<loc>69:78<loc>', 'TAAAAAAA')
('ompa<loc>5:13<loc>', 'CAAAAAAG')
('tnaa<loc>39:48<loc>', 'TAAAAAAA')
('ara<loc>2:9<loc>', 'CAAAAACA')
('cya<loc>95:104<loc>', 'TAAAAAAA')
('gale<loc>5:13<loc>', 'TAAAAAAC')
('malt<loc>69:78<loc>', 'TAAAAAAA')
('ompa<loc>5:13<loc>', 'CAAAAAAG')
('tnaa<loc>39:48<loc>', 'TAAAAAAA')
('ara<loc>2:9<loc>', 'CAAAAACA')
('cya<loc>95:104<loc>', 'TAAAAAAA')
('gale<loc>5:13<loc>', 'TAAAAAAC')
('malt<loc>69:78<loc>', 'TAAAAAAA')
('ompa<loc>5:13<loc>', 'CAAAAAAG')
('tnaa<loc>39:48<loc>', 'TAAAAAAA')
('ara<loc>2:9<loc>', 'CAAAAACA')
('cya<loc>95:104<loc>', 'TAAAAAAA')
('gale<loc>5:13<loc>', 'TAAAAAAC')
('malt<loc>69:78<loc>', 'TAAAAAAA')


In [13]:
predictions = smod.predict(input_seqs=test, return_list=True)
for p in predictions: print p


[0, 0]
[1]
[]
[]
[0, 1]
[]
[1]
[]
[]

In [14]:
predictions = smod.predict(input_seqs=test, return_list=False)
for p in predictions: print p


2
1
0
0
2
0
1
0
0

In [15]:
match = smod.transform(input_seqs=test, return_match=True)
for m in match: print m


[[(58, 68, 0.008228509929836647), (60, 70, 0.0001124991591969854)], []]
[[], [(3, 12, 0.0028686565125332964)]]
[[], []]
[[], []]
[[(67, 77, 0.00010607063581430051)], [(96, 105, 0.101427498121713)]]
[[], []]
[[], [(6, 15, 0.013831022471142682)]]
[[], []]
[[], []]

In [16]:
match = smod.transform(input_seqs=test, return_match=False)
for m in match: print m


[1, 0]
[0, 1]
[0, 0]
[0, 0]
[1, 1]
[0, 0]
[0, 1]
[0, 0]
[0, 0]

In [17]:
smod2= SMoDWrapper(alphabet='dna',
                 complexity = 3,
                 n_clusters = 3 * 3,
                 clusterer = KMeans(),
                 pos_block_size = 5,
                 neg_block_size = 5,
                 weblogo_obj=wl)
predictions = smod2.fit_predict(seqs=train)
for p in predictions: print p


Setup 0.06 secs
Fitting
0 (5, 32769) (0.14 secs) (delta: 0.14)
1 (5, 32769) (0.15 secs) (delta: 0.01)
2 (5, 32769) (0.15 secs) (delta: 0.00)
3 (5, 32769) (0.15 secs) (delta: 0.00)
4 (5, 32769) (0.15 secs) (delta: 0.00)
5 (5, 32769) (0.17 secs) (delta: 0.01)
6 (5, 32769) (0.17 secs) (delta: 0.00)
7 (5, 32769) (0.17 secs) (delta: 0.00)
8 (5, 32769) (0.17 secs) (delta: 0.00)
9 (5, 32769) (0.17 secs) (delta: 0.00)
10 (5, 32769) (0.18 secs) (delta: 0.00)
11 (5, 32769) (0.19 secs) (delta: 0.01)
12 (5, 32769) (0.20 secs) (delta: 0.01)
13 (5, 32769) (0.21 secs) (delta: 0.00)
Setup 0.06 secs
Annotating
0 (0.24 secs) (delta: 0.24)
1 (0.24 secs) (delta: 0.00)
2 (0.24 secs) (delta: 0.00)
3 (0.28 secs) (delta: 0.04)
4 (0.28 secs) (delta: 0.00)
5 (0.28 secs) (delta: 0.00)
6 (0.31 secs) (delta: 0.03)
7 (0.31 secs) (delta: 0.00)
8 (0.38 secs) (delta: 0.07)
9 (0.43 secs) (delta: 0.05)
10 (0.44 secs) (delta: 0.01)
11 (0.44 secs) (delta: 0.00)
12 (0.45 secs) (delta: 0.00)
13 (0.45 secs) (delta: 0.01)
ECDF fit on 136 values
Optimal params: a:22.81  b:6.57
Setup 0.07 secs
Annotating
0 (0.19 secs) (delta: 0.19)
1 (0.23 secs) (delta: 0.04)
2 (0.26 secs) (delta: 0.03)
3 (0.26 secs) (delta: 0.00)
4 (0.26 secs) (delta: 0.00)
5 (0.26 secs) (delta: 0.00)
6 (0.37 secs) (delta: 0.10)
7 (0.37 secs) (delta: 0.00)
8 (0.41 secs) (delta: 0.05)
9 (0.41 secs) (delta: 0.00)
10 (0.43 secs) (delta: 0.02)
11 (0.45 secs) (delta: 0.02)
12 (0.47 secs) (delta: 0.02)
13 (0.47 secs) (delta: 0.00)
Working on: 68 fragments
Setup 0.04 secs
Vectorizing
0 (8, 32769) (0.00 secs) (delta: 0.00)
1 (8, 32769) (0.01 secs) (delta: 0.01)
2 (8, 32769) (0.01 secs) (delta: 0.00)
3 (8, 32769) (0.01 secs) (delta: 0.00)
4 (8, 32769) (0.01 secs) (delta: 0.00)
5 (8, 32769) (0.02 secs) (delta: 0.00)
6 (8, 32769) (0.02 secs) (delta: 0.00)
7 (8, 32769) (0.02 secs) (delta: 0.00)
Clustering
working on 64 instances
...done  in 0.25 secs
After clustering, 9 motives
Alignment
Cluster 0 (#23) (0.02 secs)
After motives computation, 1 motives
After merge, 1 motives
After quality filter, 1 motives
0
1
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0

In [18]:
smod2= SMoDWrapper(alphabet='dna',
                 complexity = 3,
                 n_clusters = 3 * 3,
                 clusterer = KMeans(),
                 pos_block_size = 5,
                 neg_block_size = 5,
                 weblogo_obj=wl)
matches = smod2.fit_transform(seqs=train)
for m in matches: print m


Setup 0.05 secs
Fitting
0 (5, 32769) (0.14 secs) (delta: 0.14)
1 (5, 32769) (0.15 secs) (delta: 0.01)
2 (5, 32769) (0.15 secs) (delta: 0.00)
3 (5, 32769) (0.17 secs) (delta: 0.01)
4 (5, 32769) (0.17 secs) (delta: 0.00)
5 (5, 32769) (0.18 secs) (delta: 0.00)
6 (5, 32769) (0.18 secs) (delta: 0.00)
7 (5, 32769) (0.18 secs) (delta: 0.00)
8 (5, 32769) (0.20 secs) (delta: 0.01)
9 (5, 32769) (0.20 secs) (delta: 0.00)
10 (5, 32769) (0.21 secs) (delta: 0.01)
11 (5, 32769) (0.22 secs) (delta: 0.01)
12 (5, 32769) (0.22 secs) (delta: 0.00)
13 (5, 32769) (0.22 secs) (delta: 0.00)
Setup 0.07 secs
Annotating
0 (0.26 secs) (delta: 0.26)
1 (0.26 secs) (delta: 0.00)
2 (0.26 secs) (delta: 0.00)
3 (0.26 secs) (delta: 0.00)
4 (0.26 secs) (delta: 0.00)
5 (0.32 secs) (delta: 0.06)
6 (0.32 secs) (delta: 0.00)
7 (0.32 secs) (delta: 0.00)
8 (0.35 secs) (delta: 0.02)
9 (0.37 secs) (delta: 0.03)
10 (0.38 secs) (delta: 0.00)
11 (0.41 secs) (delta: 0.04)
12 (0.42 secs) (delta: 0.00)
13 (0.44 secs) (delta: 0.02)
ECDF fit on 97 values
Optimal params: a:15.37  b:4.36
Setup 0.06 secs
Annotating
0 (0.20 secs) (delta: 0.20)
1 (0.20 secs) (delta: 0.00)
2 (0.23 secs) (delta: 0.03)
3 (0.23 secs) (delta: 0.00)
4 (0.23 secs) (delta: 0.00)
5 (0.27 secs) (delta: 0.04)
6 (0.27 secs) (delta: 0.00)
7 (0.28 secs) (delta: 0.00)
8 (0.37 secs) (delta: 0.09)
9 (0.38 secs) (delta: 0.01)
10 (0.41 secs) (delta: 0.02)
11 (0.41 secs) (delta: 0.00)
12 (0.43 secs) (delta: 0.02)
13 (0.43 secs) (delta: 0.00)
Working on: 48 fragments
Setup 0.04 secs
Vectorizing
0 (6, 32769) (0.01 secs) (delta: 0.01)
1 (6, 32769) (0.01 secs) (delta: 0.00)
2 (6, 32769) (0.01 secs) (delta: 0.00)
3 (6, 32769) (0.01 secs) (delta: 0.00)
4 (6, 32769) (0.02 secs) (delta: 0.00)
5 (6, 32769) (0.02 secs) (delta: 0.00)
6 (6, 32769) (0.02 secs) (delta: 0.00)
7 (6, 32769) (0.02 secs) (delta: 0.00)
Clustering
working on 48 instances
...done  in 0.24 secs
After clustering, 9 motives
Alignment
Cluster 6 (#12) (0.01 secs)
After motives computation, 1 motives
After merge, 1 motives
After quality filter, 1 motives
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[1]
[0]
[0]
[0]
[0]

In [19]:
#printing motives as lists
for motif in smod.motives_list:
    for m in motif:
        print m
    print


('cya<loc>77:85<loc>', '-ATTTTTTC')
('ompa<loc>35:45<loc>', 'TTTTTTTTC')
('tnaa<loc>0:10<loc>', 'TTTTTAAAC')
('cya<loc>77:85<loc>', 'TATTTTTTC')
('ompa<loc>35:45<loc>', 'TTTTTTTTC')
('tnaa<loc>0:10<loc>', 'TTTTTAAAC')
('cya<loc>77:85<loc>', 'GATTTTTTC')
('ompa<loc>35:45<loc>', 'TTTTTTTTC')
('tnaa<loc>0:10<loc>', 'TTTTTAAAC')
('cya<loc>77:85<loc>', 'TATTTTTTC')
('ompa<loc>35:45<loc>', 'TTTTTTTTC')
('ce1cg<loc>58:67<loc>', 'GTTTTTTTG')
('ce1cg<loc>14:23<loc>', 'GTTTTTGTG')
('ilv<loc>11:20<loc>', 'GTTTTTTGT')
('ce1cg<loc>58:67<loc>', 'GTTTTTTTG')
('ce1cg<loc>14:23<loc>', 'GTTTTTGTG')
('ilv<loc>11:20<loc>', 'GTTTTTTGT')
('ce1cg<loc>58:67<loc>', 'GTTTTTTTG')
('ce1cg<loc>14:23<loc>', 'GTTTTTGTG')
('ilv<loc>11:20<loc>', 'GTTTTTTGT')
('ce1cg<loc>58:67<loc>', 'GTTTTTTTG')
('ce1cg<loc>14:23<loc>', 'GTTTTTGTG')
('ilv<loc>11:20<loc>', 'GTTTTTTGT')

('ara<loc>2:9<loc>', 'CAAAAACC')
('cya<loc>95:104<loc>', 'TAAAAAAA')
('gale<loc>5:13<loc>', 'TAAAAAAC')
('malt<loc>69:78<loc>', 'TAAAAAAA')
('ompa<loc>5:13<loc>', 'CAAAAAAG')
('tnaa<loc>39:48<loc>', 'TAAAAAAA')
('ara<loc>2:9<loc>', 'CAAAAACA')
('cya<loc>95:104<loc>', 'TAAAAAAA')
('gale<loc>5:13<loc>', 'TAAAAAAC')
('malt<loc>69:78<loc>', 'TAAAAAAA')
('ompa<loc>5:13<loc>', 'CAAAAAAG')
('tnaa<loc>39:48<loc>', 'TAAAAAAA')
('ara<loc>2:9<loc>', 'CAAAAACA')
('cya<loc>95:104<loc>', 'TAAAAAAA')
('gale<loc>5:13<loc>', 'TAAAAAAC')
('malt<loc>69:78<loc>', 'TAAAAAAA')
('ompa<loc>5:13<loc>', 'CAAAAAAG')
('tnaa<loc>39:48<loc>', 'TAAAAAAA')
('ara<loc>2:9<loc>', 'CAAAAACA')
('cya<loc>95:104<loc>', 'TAAAAAAA')
('gale<loc>5:13<loc>', 'TAAAAAAC')
('malt<loc>69:78<loc>', 'TAAAAAAA')


In [20]:
smod.display_logo(do_alignment=False)



In [21]:
smod.display_logo(motif_num=1)



In [22]:
smod.align_motives()    #MSA with Muscle
motives1=smod.aligned_motives_list
for m in motives1:
    for i in m:
        print i
    print


('cya<loc>77:85<loc>', 'ATTTTTTC-')
('ompa<loc>35:45<loc>', 'TTTTTTTTC')
('tnaa<loc>0:10<loc>', 'TTTTTAAAC')
('cya<loc>77:85<loc>', 'TATTTTTTC')
('ompa<loc>35:45<loc>', 'TTTTTTTTC')
('tnaa<loc>0:10<loc>', 'TTTTTAAAC')
('cya<loc>77:85<loc>', 'GATTTTTTC')
('ompa<loc>35:45<loc>', 'TTTTTTTTC')
('tnaa<loc>0:10<loc>', 'TTTTTAAAC')
('cya<loc>77:85<loc>', 'TATTTTTTC')
('ompa<loc>35:45<loc>', 'TTTTTTTTC')
('ce1cg<loc>58:67<loc>', 'GTTTTTTTG')
('ce1cg<loc>14:23<loc>', 'GTTTTTGTG')
('ilv<loc>11:20<loc>', 'GTTTTTTGT')
('ce1cg<loc>58:67<loc>', 'GTTTTTTTG')
('ce1cg<loc>14:23<loc>', 'GTTTTTGTG')
('ilv<loc>11:20<loc>', 'GTTTTTTGT')
('ce1cg<loc>58:67<loc>', 'GTTTTTTTG')
('ce1cg<loc>14:23<loc>', 'GTTTTTGTG')
('ilv<loc>11:20<loc>', 'GTTTTTTGT')
('ce1cg<loc>58:67<loc>', 'GTTTTTTTG')
('ce1cg<loc>14:23<loc>', 'GTTTTTGTG')
('ilv<loc>11:20<loc>', 'GTTTTTTGT')

('ara<loc>2:9<loc>', 'CAAAAACC')
('cya<loc>95:104<loc>', 'TAAAAAAA')
('gale<loc>5:13<loc>', 'TAAAAAAC')
('malt<loc>69:78<loc>', 'TAAAAAAA')
('ompa<loc>5:13<loc>', 'CAAAAAAG')
('tnaa<loc>39:48<loc>', 'TAAAAAAA')
('ara<loc>2:9<loc>', 'CAAAAACA')
('cya<loc>95:104<loc>', 'TAAAAAAA')
('gale<loc>5:13<loc>', 'TAAAAAAC')
('malt<loc>69:78<loc>', 'TAAAAAAA')
('ompa<loc>5:13<loc>', 'CAAAAAAG')
('tnaa<loc>39:48<loc>', 'TAAAAAAA')
('ara<loc>2:9<loc>', 'CAAAAACA')
('cya<loc>95:104<loc>', 'TAAAAAAA')
('gale<loc>5:13<loc>', 'TAAAAAAC')
('malt<loc>69:78<loc>', 'TAAAAAAA')
('ompa<loc>5:13<loc>', 'CAAAAAAG')
('tnaa<loc>39:48<loc>', 'TAAAAAAA')
('ara<loc>2:9<loc>', 'CAAAAACA')
('cya<loc>95:104<loc>', 'TAAAAAAA')
('gale<loc>5:13<loc>', 'TAAAAAAC')
('malt<loc>69:78<loc>', 'TAAAAAAA')


In [23]:
smod.display()


        0      1      2      3      4      5      6      7      8      9
-:   0.70   0.17   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00
A:   0.00   0.00   0.17   0.00   0.00   0.00   0.13   0.13   0.13   0.00
C:   0.17   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.48
G:   0.00   0.52   0.00   0.00   0.00   0.00   0.00   0.17   0.17   0.35
T:   0.13   0.30   0.83   1.00   1.00   1.00   0.87   0.70   0.70   0.17

        0      1      2      3      4      5      6      7      8
-:   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.18   0.50
A:   0.00   1.00   1.00   1.00   1.00   1.00   0.82   0.50   0.00
C:   0.32   0.00   0.00   0.00   0.00   0.00   0.18   0.18   0.36
G:   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.14   0.14
T:   0.68   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00


In [24]:
smod.display(motif_num=2)


        0      1      2      3      4      5      6      7      8
-:   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.18   0.50
A:   0.00   1.00   1.00   1.00   1.00   1.00   0.82   0.50   0.00
C:   0.32   0.00   0.00   0.00   0.00   0.00   0.18   0.18   0.36
G:   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.14   0.14
T:   0.68   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00


In [25]:
# Score a test sequence using probability score
test_seq = 'ACGT' * 10
seq_score = smod.score(motif_num=1, seq=test_seq)
print seq_score


[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

Example with HMM scoring criteria


In [27]:
meme2 = SMoDWrapper(alphabet="dna",
                    scoring_criteria="hmm", 
                    k=1, 
                    threshold=0.5,
                    complexity = 3,
                    n_clusters = 3 * 3,
                    clusterer = KMeans(),
                    pos_block_size = 5,
                    neg_block_size = 5,
                    min_score=4,
                    min_freq=0.5,
                    min_cluster_size=10,
                    similarity_th=.5,
                    freq_th=0.03)
matches = meme2.fit_transform(seqs=train, return_match=True)
for m in matches: print m


Setup 0.05 secs
Fitting
0 (5, 32769) (0.10 secs) (delta: 0.10)
1 (5, 32769) (0.14 secs) (delta: 0.04)
2 (5, 32769) (0.16 secs) (delta: 0.01)
3 (5, 32769) (0.18 secs) (delta: 0.01)
4 (5, 32769) (0.18 secs) (delta: 0.00)
5 (5, 32769) (0.19 secs) (delta: 0.00)
6 (5, 32769) (0.19 secs) (delta: 0.00)
7 (5, 32769) (0.19 secs) (delta: 0.00)
8 (5, 32769) (0.19 secs) (delta: 0.00)
9 (5, 32769) (0.20 secs) (delta: 0.00)
10 (5, 32769) (0.20 secs) (delta: 0.00)
11 (5, 32769) (0.20 secs) (delta: 0.00)
12 (5, 32769) (0.20 secs) (delta: 0.00)
13 (5, 32769) (0.20 secs) (delta: 0.00)
Setup 0.07 secs
Annotating
0 (0.15 secs) (delta: 0.15)
1 (0.20 secs) (delta: 0.05)
2 (0.20 secs) (delta: 0.00)
3 (0.21 secs) (delta: 0.01)
4 (0.23 secs) (delta: 0.02)
5 (0.23 secs) (delta: 0.00)
6 (0.23 secs) (delta: 0.00)
7 (0.24 secs) (delta: 0.00)
8 (0.33 secs) (delta: 0.09)
9 (0.37 secs) (delta: 0.04)
10 (0.37 secs) (delta: 0.00)
11 (0.39 secs) (delta: 0.02)
12 (0.40 secs) (delta: 0.01)
13 (0.40 secs) (delta: 0.00)
ECDF fit on 79 values
Optimal params: a:13.14  b:4.23
Setup 0.05 secs
Annotating
0 (0.16 secs) (delta: 0.16)
1 (0.17 secs) (delta: 0.01)
2 (0.20 secs) (delta: 0.03)
3 (0.20 secs) (delta: 0.00)
4 (0.20 secs) (delta: 0.00)
5 (0.26 secs) (delta: 0.06)
6 (0.26 secs) (delta: 0.00)
7 (0.26 secs) (delta: 0.00)
8 (0.35 secs) (delta: 0.09)
9 (0.35 secs) (delta: 0.00)
10 (0.38 secs) (delta: 0.02)
11 (0.38 secs) (delta: 0.00)
12 (0.38 secs) (delta: 0.00)
13 (0.39 secs) (delta: 0.01)
Working on: 40 fragments
Setup 0.03 secs
Vectorizing
0 (5, 32769) (0.00 secs) (delta: 0.00)
1 (5, 32769) (0.01 secs) (delta: 0.00)
2 (5, 32769) (0.01 secs) (delta: 0.00)
3 (5, 32769) (0.01 secs) (delta: 0.00)
4 (5, 32769) (0.01 secs) (delta: 0.00)
5 (5, 32769) (0.01 secs) (delta: 0.00)
6 (5, 32769) (0.01 secs) (delta: 0.00)
7 (5, 32769) (0.01 secs) (delta: 0.00)
Clustering
working on 40 instances
...done  in 0.21 secs
After clustering, 9 motives
Alignment
After motives computation, 0 motives
After merge, 0 motives
Quality filter is too strict. Ignoring filter.
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-27-63c6a5d04d0e> in <module>()
     13                     similarity_th=.5,
     14                     freq_th=0.03)
---> 15 matches = meme2.fit_transform(seqs=train, return_match=True)
     16 for m in matches: print m

/home/zr/Dropbox/HiWi/pyMotif/smod_wrapper.py in fit_transform(self, seqs, neg_seqs, return_match)
    147                       ):
    148         """Find motives and return motif match list."""
--> 149         self.fit(seqs=seqs, neg_seqs=neg_seqs)
    150         return self.transform(input_seqs=seqs, return_match=return_match)

/home/zr/Dropbox/HiWi/pyMotif/smod_wrapper.py in fit(self, seqs, neg_seqs)
    122         self.nmotifs = len(motives.keys())
    123         if self.nmotifs == 0:
--> 124             raise AttributeError("0 motives found.")
    125         self.original_motives_list = self._get_motives_list(motives)[:]
    126         self.aligned_motives_list = self._get_aligned_motives_list(

AttributeError: 0 motives found.

In [29]:
%%time
# Markov Model score
mm_score = smod.score(motif_num=2, seq="ACGT"*10)
print mm_score


[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 2.22 ms

In [ ]: