In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from eden.util import configure_logging
import logging
logger = logging.getLogger()
configure_logging(logger,verbosity=2)

In [3]:
from smod_wrapper import SMoDWrapper
from eden.sequence_motif_decomposer import SequenceMotifDecomposer as SMoD
from sklearn.cluster import KMeans
import numpy as np
from sklearn.metrics import roc_auc_score

In [4]:
import random
def random_string(length,alphabet_list):
    rand_str = ''.join(random.choice(alphabet_list) for i in range(length))
    return rand_str

def perturb(seed,alphabet_list,p=0.5):
    seq=''
    for c in seed:
        if random.random() < p: c = random.choice(alphabet_list)
        seq += c
    return seq

def make_artificial_dataset(alphabet='ACGT', motives=None, motif_length=6, 
                            sequence_length=100, n_sequences=1000, n_motives=2, p=0.2,
                           random_state=1):
    random.seed(random_state)
    alphabet_list=[c for c in alphabet]
    
    if motives is None:
        motives=[]
        for i in range(n_motives):
            motives.append(random_string(motif_length,alphabet_list))
    else:
        motif_length = len(motives[0])
        n_motives = len(motives)
    
    sequence_length = sequence_length / len(motives)
    flanking_length = (sequence_length - motif_length ) / 2
    n_seq_per_motif = n_sequences

    counter=0
    seqs=[]
    for i in range(n_seq_per_motif):
        total_seq = ''
        total_binary_seq=''
        for j in range(n_motives):
            left_flanking = random_string(flanking_length,alphabet_list)
            right_flanking = random_string(flanking_length,alphabet_list)
            noisy_motif = perturb(motives[j],alphabet_list,p)
            seq = left_flanking + noisy_motif + right_flanking
            total_seq += seq
        seqs.append(('ID%d'%counter,total_seq))
        counter += 1
    binary_skeleton = '0' * flanking_length + '1' * motif_length + '0' * flanking_length
    binary_seq = binary_skeleton * n_motives
    return motives, seqs, binary_seq

In [5]:
n_sequences = 100
motives, pos_seqs, binary_seq = make_artificial_dataset(alphabet='ACGT',
                                                        #motives=motives,
                                                        sequence_length=500,
                                                        n_sequences=n_sequences,
                                                        motif_length=10,
                                                        n_motives=4,
                                                        p=0.2)
    
from eden.modifier.seq import seq_to_seq, shuffle_modifier
neg_seqs = seq_to_seq(pos_seqs, modifier=shuffle_modifier, times=1, order=2)
neg_seqs = list(neg_seqs)

block_size=n_sequences/8

pos_size = len(pos_seqs)
train_pos_seqs = pos_seqs[:pos_size/2]
test_pos_seqs = pos_seqs[pos_size/2:]

neg_size = len(neg_seqs)
train_neg_seqs = neg_seqs[:neg_size/2]
test_neg_seqs = neg_seqs[neg_size/2:]

true_score = [float(int(i)) for i in binary_seq]

In [14]:
complexity=5
n_clusters=10
min_subarray_size=4
max_subarray_size=10
#estimator=SGDClassifier(warm_start=True)
clusterer=KMeans()
pos_block_size=40
neg_block_size=40
n_jobs=-1
p_value=0.05
similarity_th=0.5
min_score=4
min_freq=0.5
min_cluster_size=10
regex_th=0.3
sample_size=200
freq_th=None
std_th=None

In [15]:
#%%time
smod = SMoD(complexity=complexity,
            n_clusters=n_clusters,
            min_subarray_size=min_subarray_size,
            max_subarray_size=max_subarray_size,
            #estimator=estimator,
            clusterer=clusterer,
            pos_block_size=pos_block_size,
            neg_block_size=neg_block_size,
            n_jobs=n_jobs)
smod = smod.fit(train_pos_seqs, train_neg_seqs)
motives = smod.select_motives(train_pos_seqs,
                    p_value,
                    similarity_th,
                    min_score,
                    min_freq,
                    min_cluster_size,
                    regex_th,
                    sample_size,
                    freq_th,
                    std_th)


Setup 0.08 secs
Fitting
0 (40, 32769) (4.04 secs) (delta: 4.04)
Setup 0.05 secs
Annotating
0 (23.02 secs) (delta: 23.02)
ECDF fit on 149 values
Optimal params: a:15.38  b:2.67
Setup 0.06 secs
Annotating
0 (25.31 secs) (delta: 25.31)
Working on: 35 fragments
Setup 0.05 secs
Vectorizing
0 (4, 32769) (0.02 secs) (delta: 0.02)
1 (4, 32769) (0.02 secs) (delta: 0.00)
2 (4, 32769) (0.02 secs) (delta: 0.00)
3 (4, 32769) (0.02 secs) (delta: 0.00)
4 (4, 32769) (0.02 secs) (delta: 0.00)
5 (4, 32769) (0.03 secs) (delta: 0.00)
6 (4, 32769) (0.03 secs) (delta: 0.00)
7 (4, 32769) (0.03 secs) (delta: 0.00)
Clustering
working on 32 instances
...done  in 0.39 secs
After clustering, 10 motives
Alignment
After motives computation, 0 motives
After merge, 0 motives
Quality filter is too strict. Ignoring filter.

In [16]:
scores = list(smod.score(test_pos_seqs))


Setup 0.07 secs
Predicting
0 (6.57 secs) (delta: 6.57)

In [17]:
print len(scores)
print len(test_pos_seqs)


40
50

In [24]:
#%%time
smod_wrapper = SMoDWrapper(alphabet='dna',
                           scoring_criteria = 'pwm',
                           complexity = complexity,
                           n_clusters = n_clusters,
                           min_subarray_size = min_subarray_size,
                           max_subarray_size = max_subarray_size,
                           pos_block_size = block_size,
                           neg_block_size = block_size,
                           clusterer = KMeans(),
                           min_score = min_score,
                           min_freq = min_freq,
                           min_cluster_size = min_cluster_size,
                           similarity_th = similarity_th,
                           freq_th = freq_th,
                           p_value=p_value,
                           regex_th=regex_th,
                           sample_size=sample_size,
                           std_th=std_th)
smod_wrapper.fit(pos_seqs, neg_seqs)

In [25]:
seq_scr = []
for k in range(smod_wrapper.nmotifs):
    scr = []
    for l in range(len(test_pos_seqs)):
        scr=smod_wrapper.score(motif_num=k+1, seq=test_pos_seqs[l][1])
        seq_scr.append(scr)

In [ ]:


In [13]:
mean_score = np.mean(scores, axis=0)
roc_score = roc_auc_score(true_score, mean_score)
print "for SMoD scoring: ",roc_score


mean_score = np.mean(seq_scr, axis=0)
roc_score = roc_auc_score(true_score, mean_score)
print "for SMoD wrapper: ",roc_score


for SMoD scoring:  0.641611842105
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-13-38b958906ced> in <module>()
      5 
      6 
----> 7 mean_score = np.mean(seq_scr, axis=0)
      8 roc_score = roc_auc_score(true_score, mean_score)
      9 print "for SMoD wrapper: ",roc_score

NameError: name 'seq_scr' is not defined

In [ ]: