In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
import random
def random_string(length,alphabet_list):
rand_str = ''.join(random.choice(alphabet_list) for i in range(length))
return rand_str
def perturb(seed,alphabet_list,p=0.5):
seq=''
for c in seed:
if random.random() < p: c = random.choice(alphabet_list)
seq += c
return seq
def make_artificial_dataset(alphabet='ACGT', motives=None, motif_length=6,
sequence_length=100, n_sequences=1000, n_motives=2, p=0.2,
random_state=1):
random.seed(random_state)
alphabet_list=[c for c in alphabet]
if motives is None:
motives=[]
for i in range(n_motives):
motives.append(random_string(motif_length,alphabet_list))
else:
motif_length = len(motives[0])
n_motives = len(motives)
sequence_length = sequence_length / len(motives)
flanking_length = (sequence_length - motif_length ) / 2
n_seq_per_motif = n_sequences
counter=0
seqs=[]
for i in range(n_seq_per_motif):
total_seq = ''
total_binary_seq=''
for j in range(n_motives):
left_flanking = random_string(flanking_length,alphabet_list)
right_flanking = random_string(flanking_length,alphabet_list)
noisy_motif = perturb(motives[j],alphabet_list,p)
seq = left_flanking + noisy_motif + right_flanking
total_seq += seq
seqs.append(('ID%d'%counter,total_seq))
counter += 1
binary_skeleton = '0' * flanking_length + '1' * motif_length + '0' * flanking_length
binary_seq = binary_skeleton * n_motives
return motives, seqs, binary_seq
In [37]:
from eden.motif import SequenceMotif
#help(SequenceMotif)
In [53]:
#setup parameters
alphabet='ACGT'
semi_len=5
motives=['A'*semi_len+'C'*semi_len,
#'C'*semi_len+'A'*semi_len,
#'A'*semi_len+'T'*semi_len,
'T'*semi_len+'A'*semi_len]
sequence_length=40
n_sequences=200
p=0.8
n_motives = 2
motif_length = 10
#make dataset
motives, seqs, binary_seqs = make_artificial_dataset(alphabet=alphabet,
motives=motives,
#n_motives = n_motives,
#motif_length = motif_length,
sequence_length=sequence_length,
n_sequences=n_sequences,
p=p,
random_state=8)
In [ ]:
In [3]:
from utilities import Weblogo
from IPython.display import Image, display
In [63]:
img = Weblogo().create_logo(seqs)
display(Image(img))
In [41]:
# display first 10 sequences and the corresponding binary sequences
for i in range(10):
print seqs[i][0]
print seqs[i][1]
print binary_seqs[i][1]
print
In [42]:
#display
print 'Motives and sample of their perturbed variants:'
alphabet_list=[c for c in alphabet]
for motif in motives:
print
print 'true motif:', motif, ' noisy: ',
for i in range(4):
print perturb(motif,alphabet_list,p=p),
In [4]:
from sklearn.cluster import KMeans
from eden_wrapper import EdenWrapper
from meme_wrapper import Meme
from smod_wrapper import SMoDWrapper
In [44]:
# Parameters for tools
alphabet = 'dna' # Sequence type
scoring_criteria = "pwm" # ["pwm", "hmm"]
nmotifs = 4 # No. of motives to be found
minw = len(motives[0])-2 # Minimum width of motif
maxw = len(motives[0])+2 # Maximum width of motif
from utilities import Weblogo
wl = Weblogo(output_format='png',
sequence_type='dna',
resolution=200,
stacks_per_line=40,
units='probability')
In [9]:
from eden.util import configure_logging
import logging
configure_logging(logging.getLogger(),verbosity=2)
In [10]:
%%time
km = KMeans(n_clusters=nmotifs)
tool = EdenWrapper(alphabet=alphabet,
scoring_criteria = scoring_criteria,
complexity=5,
nbits=14,
negative_ratio=3,
min_subarray_size=minw,
max_subarray_size=maxw,
clustering_algorithm=km,
weblogo_obj=wl)
tool.fit(seqs)
In [94]:
%%time
# save input sequences as fasta file for MEME tool
with open('seqs.fa','w') as f_train:
for seq in seqs:
f_train.write('>' + seq[0] + ' \n')
f_train.write(seq[1] + '\n')
tool = Meme(alphabet='dna',
scoring_criteria = scoring_criteria,
minw=minw,
maxw=maxw,
nmotifs=nmotifs,
weblogo_obj=wl)
tool.fit('seqs.fa')
In [85]:
In [54]:
tool = SMoDWrapper(scoring_criteria = 'pwm',
#complexity = complexity,
#n_clusters = n_clusters,
min_subarray_size = 8,
max_subarray_size = 12,
pos_block_size = 20,
neg_block_size = 20,
clusterer = KMeans(),
min_score = 7,
min_freq = 0.69,
min_cluster_size = 1,
similarity_th = 0.5,
freq_th = 0.69,
p_value=1,
regex_th=0.92,
#sample_size=sample_size,
std_th=0.48
)
from eden.modifier.seq import seq_to_seq, shuffle_modifier
neg_seqs = seq_to_seq(seqs, modifier=shuffle_modifier, times=1, order=2)
neg_seqs = list(neg_seqs)
block_size=n_sequences/8
pos_size = len(seqs)
train_pos_seqs = seqs[:pos_size/2]
test_pos_seqs = seqs[pos_size/2:]
neg_size = len(neg_seqs)
train_neg_seqs = neg_seqs[:neg_size/2]
test_neg_seqs = neg_seqs[neg_size/2:]
tool.fit(seqs, neg_seqs)
#smod.display_logo()
In [55]:
tool.display()
In [99]:
% matplotlib inline
img = Weblogo().create_logo(seqs)
display(Image(img))
import pylab as plt
import numpy as np
sig = np.zeros(sequence_length)
print "Average over all sequences"
for j in range(len(motives)):
for i in range(len(seqs)):
score = tool.score(motif_num=j+1, seq=seqs[i][1])
sig += np.array(score)
sig /= float(len(seqs))
#sig *= len(motives)
plt.figure(figsize=(16,3))
plt.fill_between(range(len(sig)), 0, sig, alpha=0.3)
plt.show()
print
print "for individual sequences"
for i in range(len(seqs[:10])):
sig = np.zeros(sequence_length)
for j in range(len(motives)):
score = tool.score(motif_num=j+1, seq=seqs[i][1])
sig += np.array(score)
sig /= float(len(seqs))
sig *= len(motives)
plt.figure(figsize=(16,3))
plt.fill_between(range(len(sig)), 0, sig, alpha=0.3)
plt.xticks(range(len(sig)), seqs[i][1])
plt.show()
In [21]:
tool.display_logo(do_alignment=True)
In [22]:
%%time
test_seq = seqs[0][1]
score_pwm = tool.score(motif_num=1, seq=test_seq)
print score_pwm
In [23]:
import matplotlib.pyplot as plt
plt.plot(score_pwm)
plt.xlabel('Sequence Position')
plt.ylabel('Score')
plt.show()
In [28]:
from sklearn.metrics import roc_auc_score
true_score = [float(int(x)) for x in binary_seqs[0][1]]
roc_score = roc_auc_score(true_score, score_pwm)
print "ROC Score: ", roc_score
In [32]:
%%time
km = KMeans(n_clusters=nmotifs)
tool = EdenWrapper(alphabet=alphabet,
scoring_criteria = "hmm",
complexity=5,
nbits=14,
negative_ratio=3,
min_subarray_size=5,
max_subarray_size=7,
clustering_algorithm=km,
weblogo_obj=wl)
tool.fit(seqs)
In [33]:
%%time
test_seq = seqs[0][1]
score_mm = tool.score(motif_num=1, seq=test_seq)
print score_mm
In [34]:
import matplotlib.pyplot as plt
plt.plot(score_mm)
plt.xlabel('Sequence Position')
plt.ylabel('Score')
plt.show()
In [35]:
true_score = [float(int(x)) for x in binary_seqs[0][1]]
roc_score = roc_auc_score(true_score, score_mm)
print "ROC Score: ", roc_score
In [ ]: