In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
from eden.util import configure_logging
import logging
logger = logging.getLogger()
configure_logging(logger,verbosity=2)
In [3]:
from smod_wrapper import SMoDWrapper
from eden.sequence_motif_decomposer import SequenceMotifDecomposer as SMoD
from sklearn.cluster import KMeans
import numpy as np
from sklearn.metrics import roc_auc_score
In [4]:
import random
def random_string(length,alphabet_list):
rand_str = ''.join(random.choice(alphabet_list) for i in range(length))
return rand_str
def perturb(seed,alphabet_list,p=0.5):
seq=''
for c in seed:
if random.random() < p: c = random.choice(alphabet_list)
seq += c
return seq
def make_artificial_dataset(alphabet='ACGT', motives=None, motif_length=6,
sequence_length=100, n_sequences=1000, n_motives=2, p=0.2,
random_state=1):
random.seed(random_state)
alphabet_list=[c for c in alphabet]
if motives is None:
motives=[]
for i in range(n_motives):
motives.append(random_string(motif_length,alphabet_list))
else:
motif_length = len(motives[0])
n_motives = len(motives)
sequence_length = sequence_length / len(motives)
flanking_length = (sequence_length - motif_length ) / 2
n_seq_per_motif = n_sequences
counter=0
seqs=[]
for i in range(n_seq_per_motif):
total_seq = ''
total_binary_seq=''
for j in range(n_motives):
left_flanking = random_string(flanking_length,alphabet_list)
right_flanking = random_string(flanking_length,alphabet_list)
noisy_motif = perturb(motives[j],alphabet_list,p)
seq = left_flanking + noisy_motif + right_flanking
total_seq += seq
seqs.append(('ID%d'%counter,total_seq))
counter += 1
binary_skeleton = '0' * flanking_length + '1' * motif_length + '0' * flanking_length
binary_seq = binary_skeleton * n_motives
return motives, seqs, binary_seq
In [5]:
n_sequences = 100
motives, pos_seqs, binary_seq = make_artificial_dataset(alphabet='ACGT',
#motives=motives,
sequence_length=500,
n_sequences=n_sequences,
motif_length=10,
n_motives=4,
p=0.2)
from eden.modifier.seq import seq_to_seq, shuffle_modifier
neg_seqs = seq_to_seq(pos_seqs, modifier=shuffle_modifier, times=1, order=2)
neg_seqs = list(neg_seqs)
block_size=n_sequences/8
pos_size = len(pos_seqs)
train_pos_seqs = pos_seqs[:pos_size/2]
test_pos_seqs = pos_seqs[pos_size/2:]
neg_size = len(neg_seqs)
train_neg_seqs = neg_seqs[:neg_size/2]
test_neg_seqs = neg_seqs[neg_size/2:]
true_score = [float(int(i)) for i in binary_seq]
In [14]:
complexity=5
n_clusters=10
min_subarray_size=4
max_subarray_size=10
#estimator=SGDClassifier(warm_start=True)
clusterer=KMeans()
pos_block_size=40
neg_block_size=40
n_jobs=-1
p_value=0.05
similarity_th=0.5
min_score=4
min_freq=0.5
min_cluster_size=10
regex_th=0.3
sample_size=200
freq_th=None
std_th=None
In [15]:
#%%time
smod = SMoD(complexity=complexity,
n_clusters=n_clusters,
min_subarray_size=min_subarray_size,
max_subarray_size=max_subarray_size,
#estimator=estimator,
clusterer=clusterer,
pos_block_size=pos_block_size,
neg_block_size=neg_block_size,
n_jobs=n_jobs)
smod = smod.fit(train_pos_seqs, train_neg_seqs)
motives = smod.select_motives(train_pos_seqs,
p_value,
similarity_th,
min_score,
min_freq,
min_cluster_size,
regex_th,
sample_size,
freq_th,
std_th)
In [16]:
scores = list(smod.score(test_pos_seqs))
In [17]:
print len(scores)
print len(test_pos_seqs)
In [24]:
#%%time
smod_wrapper = SMoDWrapper(alphabet='dna',
scoring_criteria = 'pwm',
complexity = complexity,
n_clusters = n_clusters,
min_subarray_size = min_subarray_size,
max_subarray_size = max_subarray_size,
pos_block_size = block_size,
neg_block_size = block_size,
clusterer = KMeans(),
min_score = min_score,
min_freq = min_freq,
min_cluster_size = min_cluster_size,
similarity_th = similarity_th,
freq_th = freq_th,
p_value=p_value,
regex_th=regex_th,
sample_size=sample_size,
std_th=std_th)
smod_wrapper.fit(pos_seqs, neg_seqs)
In [25]:
seq_scr = []
for k in range(smod_wrapper.nmotifs):
scr = []
for l in range(len(test_pos_seqs)):
scr=smod_wrapper.score(motif_num=k+1, seq=test_pos_seqs[l][1])
seq_scr.append(scr)
In [ ]:
In [13]:
mean_score = np.mean(scores, axis=0)
roc_score = roc_auc_score(true_score, mean_score)
print "for SMoD scoring: ",roc_score
mean_score = np.mean(seq_scr, axis=0)
roc_score = roc_auc_score(true_score, mean_score)
print "for SMoD wrapper: ",roc_score
In [ ]: