In [10]:
%matplotlib inline
from eden.util import configure_logging
import logging
configure_logging(logging.getLogger(),verbosity=2)
given the RFAM id of a family we retrieve it from the RFAM online database composing the correspondent URL
In [11]:
def rfam_uri(family_id):
return '%s.fa'%(family_id)
def rfam_uri(family_id):
return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0'%(family_id,family_id)
In [12]:
rfam_id = 'RF00162' #SAM riboswitch (S box leader)
rfam_id = 'RF01725' #SAM-I/IV variant riboswitch
rfam_id = 'RF00005' #tRNA
rfam_id = 'RF02276' #hammerhead
prepare a function that composes all desired pre-processing steps over the original data
In [13]:
def pre_process( data ):
from eden.converter.rna.rnafold import rnafold_to_eden
graphs = rnafold_to_eden( data )
return graphs
employ the vectorizer to transform the graph instances into sparse vectors and collect all the vectors in a data matrix
repeat the procedure on a different RNA family that will be used as negative class (...or randomly shuffle the sequences to create the negative examples)
concatenate the two data matrices into one matrix and create a target vector that identifies with +1 and -1 the instances comping form the orginal and the shuffled data
Fit a predictor
In [19]:
def model(uri, pre_process, size=None):
from eden.graph import Vectorizer
vectorizer = Vectorizer( complexity=2 )
from eden.converter.fasta import fasta_to_sequence
seqs = fasta_to_sequence(uri)
from itertools import tee, islice
if size is not None:
seqs = islice(seqs,size)
seqs, seqs_ = tee(seqs)
pos_graphs = pre_process( seqs )
from eden.modifier.seq import seq_to_seq, shuffle_modifier
seqs_neg = seq_to_seq( seqs_, modifier=shuffle_modifier, times=4, order=2 )
neg_graphs = pre_process( seqs_neg )
from eden.util import fit
estimator = fit(pos_graphs, neg_graphs, vectorizer, n_jobs=-1, cv=10, n_iter_search=20)
return estimator, vectorizer
In [20]:
%%time
estimator, vectorizer = model(rfam_uri( rfam_id ), pre_process, size=50)
Setup the Annotator
the Annotator takes in input a vecotrizer and a predictor
annotate with colors the discriminative importance of different regions of the structural graphs
In [27]:
from eden.util.display import draw_graph
import itertools
from eden.converter.fasta import fasta_to_sequence
seqs = fasta_to_sequence(rfam_uri( rfam_id ))
graphs = itertools.islice(pre_process(seqs),7,8)
graphs = vectorizer.annotate( graphs, estimator = estimator )
#parameters for visualization
opts={'size':9, 'node_border':False, 'node_size':200, 'font_size':9, 'vertex_alpha':0.6, 'title_key':'id'}
for i, graph in enumerate(graphs):
print i
#draw_graph(graph, vertex_color='importance', colormap='YlOrRd', **opts)
draw_graph(graph, vertex_color='importance', colormap='YlOrRd', file_name='rna_%d.pdf'%(i), **opts)
In [ ]: