Given a large set of sequences or graphs with ordered vertices find small vertex ordered subsequences that are most discriminative for the set.
Steps:
Output:
In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
#code for making artificial dataset
import random
def random_string(length,alphabet_list):
rand_str = ''.join(random.choice(alphabet_list) for i in range(length))
return rand_str
def perturb(seed,alphabet_list,p=0.5):
seq=''
for c in seed:
if random.random() < p: c = random.choice(alphabet_list)
seq += c
return seq
def make_artificial_dataset(alphabet='ACGU', motives=None, motif_length=6, sequence_length=100, n_sequences=1000, n_motives=2, p=0.2):
alphabet_list=[c for c in alphabet]
if motives is None:
motives=[]
for i in range(n_motives):
motives.append(random_string(motif_length,alphabet_list))
else:
motif_length = len(motives[0])
n_motives = len(motives)
flanking_length = (sequence_length - motif_length ) / 2
n_seq_per_motif = n_sequences / n_motives
counter=0
seqs=[]
for i in range(n_seq_per_motif):
for j in range(n_motives):
left_flanking = random_string(flanking_length,alphabet_list)
right_flanking = random_string(flanking_length,alphabet_list)
noisy_motif = perturb(motives[j],alphabet_list,p)
seq = left_flanking + noisy_motif + right_flanking
seqs.append(('>ID%d'%counter,seq))
counter += 1
return motives, seqs
In [3]:
from eden.motif import SequenceMotif
help(SequenceMotif)
In [4]:
#setup parameters
alphabet='ACGU'
motives=['AAAAAAAAAA','CCCCCCCCCC','GGGGGGGGGG','UUUUUUUUUU']
sequence_length=100
n_sequences=100
p=0.3
#make dataset
motives, seqs = make_artificial_dataset(alphabet=alphabet,motives=motives,sequence_length=sequence_length,n_sequences=n_sequences,p=p)
#display
print 'Motives and sample of their perturbed variants:'
alphabet_list=[c for c in alphabet]
for motif in motives:
print
print motif,
for i in range(9):
print perturb(motif,alphabet_list,p=p),
In [11]:
#save to file
fname='artificial_motif_search_dataset.fa'
with open(fname,'w') as f:
for header,seq in seqs:
f.write(header+"\n")
f.write(seq+"\n")
#save explicit negative sequences
from eden.modifier.seq import seq_to_seq, shuffle_modifier
neg_seqs = list(seq_to_seq(seqs, modifier=shuffle_modifier, times=2, order=2))
fname='artificial_motif_search_dataset_negatives.fa'
with open(fname,'w') as f:
for header,seq in neg_seqs:
f.write(header+"\n")
f.write(seq+"\n")
In [12]:
from eden.util import configure_logging
import logging
configure_logging(logging.getLogger(),verbosity=2)
In [15]:
%%time
from sklearn.cluster import Birch
ca = Birch(threshold=0.1, n_clusters=4, branching_factor=50)
from eden.motif import SequenceMotif
seqmot = SequenceMotif(training_size=100, complexity=2, nbits=14, clustering_algorithm=ca)
seqmot.fit(seqs, neg_seqs)
seqmot.save('seqmot')
In [16]:
for cluster_id in seqmot.motives_db:
print cluster_id
for count, motif in sorted(seqmot.motives_db[cluster_id], reverse=True):
print motif, count
In [17]:
from eden.motif import SequenceMotif
seqmot2 = SequenceMotif()
seqmot2.load('seqmot')
predictions=seqmot2.predict(seqs, return_list=True)
for p in predictions: print p
In [18]:
predictions=seqmot2.predict(seqs, return_list=False)
for p in predictions: print p
In [19]:
predictions=seqmot2.transform(seqs, return_match=True)
for p in predictions: print p
In [20]:
predictions=seqmot2.transform(seqs, return_match=False)
for p in predictions: print p
In [21]:
%%time
from sklearn.cluster import MiniBatchKMeans
ca = MiniBatchKMeans(n_clusters=4)
from eden.motif import SequenceMotif
seqmot = SequenceMotif(training_size=100, clustering_algorithm=ca)
seqmot.fit(seqs)
for cluster_id in seqmot.motives_db:
print cluster_id
for count, motif in sorted(seqmot.motives_db[cluster_id], reverse=True):
print motif, count
In [23]:
%%time
from sklearn.cluster import DBSCAN
ca = DBSCAN(eps=0.2, min_samples=3)
from eden.motif import SequenceMotif
seqmot = SequenceMotif(training_size=100, clustering_algorithm=ca)
seqmot.fit(seqs)
for cluster_id in seqmot.motives_db:
print cluster_id
for count, motif in sorted(seqmot.motives_db[cluster_id], reverse=True):
print motif, count