RNA family characterization

Application scenario: we want to characterize RNA family, identifying regions with their structural contexts that are characteristic of the family


In [1]:
%matplotlib inline

In [2]:
import logging
from eden.util import configure_logging
configure_logging(logging.getLogger(), verbosity=2)

given the RFAM id of a family we retrieve it from the RFAM online database composing the correspondent URL


In [3]:
def rfam_uri(family_id):
    return '%s.fa'%(family_id)
def rfam_uri(family_id):
    return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0'%(family_id,family_id)

In [4]:
rfam_id = 'RF00871' #microRNA mir-689

In [5]:
rfam_id = 'RF00380' #

In [6]:
rfam_id = 'RF00005' #tRNA

In [7]:
rfam_id = 'RF02275' #Hammerhead_HH9

In [8]:
#parameters for visualization
opts={'size':14, 'node_border':False, 'node_size':200, 'font_size':9, 'vertex_alpha':0.6,'title_key':'id'}

from eden.util.display import draw_graph, serialize_graph
import itertools
def plot(graphs, num=3, file_name=None, opts=None):    
    graphs = itertools.islice(graphs,num)
    for i,graph in enumerate(graphs): 
        if file_name is not None:
            draw_graph(graph, file_name=file_name+str(i)+'.png', **opts)
        else:
            draw_graph(graph, **opts)

In [9]:
def pre_process(data):
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(data)
    
    from eden.converter.fasta import sequence_to_eden
    graphs = sequence_to_eden(seqs)

    from eden.modifier.graph import vertex_attributes 
    graphs = vertex_attributes.colorize(graphs, output_attribute = 'level', labels = ['A','U','C','G'])
    
    return graphs

In [10]:
graphs = pre_process(rfam_uri(rfam_id))
my_opt=dict(opts)
my_opt.update({'vertex_color':'level', 'colormap':'Set3', 'prog':'circo', 'size_x_to_y_ratio':10})
plot(graphs,opts=my_opt)


Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF02275/alignment?acc=RF02275&format=fastau&download=0 HTTP/1.1" 200 3418

In [11]:
def pre_process(data):
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(data)
   
    from eden.converter.rna.rnashapes_struct import rnashapes_struct_to_eden
    graphs = rnashapes_struct_to_eden(seqs, energy=True, shape=True, dotbracket=True, split_components=False, shape_type=5, energy_range=35, max_num=3)

    from eden.modifier.graph import vertex_attributes 
    graphs = vertex_attributes.colorize(graphs, output_attribute = 'level', labels = ['.','(',')','[',']','x','0','1','2','3','4','5','6','7','8','9'])
    
    return graphs

In [12]:
graphs = pre_process(rfam_uri(rfam_id))
my_opt=dict(opts)
my_opt.update({'vertex_color':'level', 'colormap':'Set1'})
plot(graphs,opts=my_opt)


Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF02275/alignment?acc=RF02275&format=fastau&download=0 HTTP/1.1" 200 3418
/Users/costa/anaconda/lib/python2.7/site-packages/matplotlib/text.py:52: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
  if rotation in ('horizontal', None):
/Users/costa/anaconda/lib/python2.7/site-packages/matplotlib/text.py:54: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
  elif rotation == 'vertical':

In [13]:
def pre_process(data):
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(data)
    
    from eden.converter.rna.rnafold import rnafold_to_eden
    graphs = rnafold_to_eden(seqs)

    from eden.modifier.graph import vertex_attributes
    graphs = vertex_attributes.trapezoidal_reweighting(graphs,
                                                       high_weight = 1, low_weight = 0.0, 
                                                       high_weight_window_start = 20, high_weight_window_end = 30, 
                                                       low_weight_window_start = 10, low_weight_window_end = 40)
    return graphs

In [14]:
graphs = pre_process(rfam_uri(rfam_id))
my_opt=dict(opts)
my_opt.update({'vertex_color':'weight', 'colormap':'summer', 'edge_color':'_labels_'})
plot(graphs,opts=my_opt)


Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF02275/alignment?acc=RF02275&format=fastau&download=0 HTTP/1.1" 200 3418

In [15]:
def pre_process(data):
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(data)
    
    from eden.converter.rna.rnaplfold import rnaplfold_to_eden
    graphs = rnaplfold_to_eden(seqs, window_size = 150, max_bp_span = 100, avg_bp_prob_cutoff = 0.01, max_num_edges = 3)

    from eden.modifier.graph import vertex_attributes 
    graphs = vertex_attributes.colorize(graphs, output_attribute = 'level', labels = ['A','U','C','G'])

    return graphs

In [16]:
graphs = pre_process(rfam_uri(rfam_id))
my_opt=dict(opts)
my_opt.update({'vertex_color':'level', 'colormap':'Set3'})
plot(graphs,opts=my_opt)


Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF02275/alignment?acc=RF02275&format=fastau&download=0 HTTP/1.1" 200 3418

In [17]:
def weighter(seqs):
    for header,seq in seqs:
        yield range(len(seq))

def pre_process(data):
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(data)
    from itertools import tee
    seqs,seqs_=tee(seqs)
    
    from eden.converter.rna.rnashapes import rnashapes_to_eden
    graphs = rnashapes_to_eden(seqs, shape_type=5, energy_range=35, max_num=3, split_components=False)
    
    from eden.modifier.graph.vertex_attributes import reweight 
    graphs = reweight(graphs,weighter(seqs_))
    
    return graphs

In [18]:
graphs = pre_process(rfam_uri(rfam_id))
my_opt=dict(opts)
my_opt.update({'vertex_color':'weight', 'colormap':'winter'})
plot(graphs,opts=my_opt)


Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF02275/alignment?acc=RF02275&format=fastau&download=0 HTTP/1.1" 200 3418

In [19]:
def pre_process(data):
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(data)
    
    from eden.converter.rna.rnasubopt import rnasubopt_to_eden
    graphs = rnasubopt_to_eden(seqs, energy_range=10, max_num=4, split_components=False)
    
    from eden.modifier.graph import vertex_attributes 
    graphs = vertex_attributes.trapezoidal_reweighting(graphs,
                                                       high_weight = 1, low_weight = 0.0, 
                                                       high_weight_window_start = 25, high_weight_window_end = 35, 
                                                       low_weight_window_start = 15, low_weight_window_end = 45)
    return graphs

In [20]:
graphs = pre_process(rfam_uri(rfam_id))
my_opt=dict(opts)
my_opt.update({'vertex_color':'weight', 'colormap':'summer'})
plot(graphs,opts=my_opt)


Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF02275/alignment?acc=RF02275&format=fastau&download=0 HTTP/1.1" 200 3418

In [27]:
def pre_process(data):
    
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(data)
    
    from eden.converter.rna.rnashapes import rnashapes_to_eden
    graphs = rnashapes_to_eden(seqs, shape_type=5, energy_range=30, max_num=1)
                
    #annotate in node attribute 'type' the incident edges' labels
    from eden.modifier.graph import vertex_attributes
    graphs = vertex_attributes.incident_edge_label(graphs, level = 1, output_attribute = 'type', separator = '.')
    from eden.modifier.graph.structure import contraction, contraction_modifier
    #reduce all 'label' attributes of contracted nodes to a histogram to be written in the 'label' attribute of the resulting graph 
    label_modifier = contraction_modifier(attribute_in='type', attribute_out='label', reduction='set_categorical')
    #reduce all 'weight' attributes of contracted nodes using a sum to be written in the 'weight' attribute of the resulting graph 
    weight_modifier = contraction_modifier(attribute_in='weight', attribute_out='weight', reduction='sum')
    modifiers = [label_modifier, weight_modifier]
    #contract the graph on the 'type' attribute
    graphs = contraction(graphs, contraction_attribute = 'type', modifiers = modifiers, nesting = False)

    from eden.modifier.graph import vertex_attributes 
    graphs = vertex_attributes.colorize(graphs, output_attribute = 'level', labels = ['X','A','U','C','G'])
    
    return graphs

In [28]:
graphs = pre_process(rfam_uri(rfam_id))
my_opt=dict(opts)
my_opt.update({'size':20, 'vertex_color':'level', 'colormap':'Set3','prog':'sfdp'})
plot(graphs,opts=my_opt, num=3)


Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF02275/alignment?acc=RF02275&format=fastau&download=0 HTTP/1.1" 200 3418

In [31]:
def pre_process(data):
    
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(data)

    # make a copy of the seqs to make negative data
    from itertools import tee
    seqs,seqs_ = tee(seqs)
    from eden.modifier.seq import seq_to_seq, shuffle_modifier
    neg_seqs = seq_to_seq(seqs_, modifier=shuffle_modifier, times=2)

    from eden.converter.rna.rnashapes import rnashapes_to_eden
    graphs = rnashapes_to_eden(seqs, shape_type=5, energy_range=30, max_num=1)
    neg_graphs = rnashapes_to_eden(neg_seqs, shape_type=5, energy_range=30, max_num=1)
    
    #learn a predictive model
    from eden.graph import Vectorizer
    vectorizer=Vectorizer(3)
    graphs,graphs_ = tee(graphs)
    from eden.util import fit
    estimator = fit(graphs_,neg_graphs, vectorizer)
    #annotate
    graphs = vectorizer.annotate(graphs, estimator=estimator)
    # use importance as a binary label in 'type'
    from eden.modifier.graph.vertex_attributes import discretize
    graphs = discretize(graph_list = graphs, output_attribute = 'type', input_attribute='importance', interval=1)

    from eden.modifier.graph.structure import contraction, contraction_modifier
    #reduce all 'label' attributes of contracted nodes to a histogram to be written in the 'label' attribute of the resulting graph 
    label_modifier = contraction_modifier(attribute_in='type', attribute_out='label', reduction='set_categorical')
    modifiers = [label_modifier]
    #contract the graph on the 'type' attribute
    graphs = contraction(graphs, contraction_attribute = 'type', modifiers = modifiers, nesting = True)

    from eden.modifier.graph import vertex_attributes 
    graphs = vertex_attributes.colorize(graphs, output_attribute = 'level', labels = ['X','A','U','C','G'])
    
    return graphs

In [32]:
graphs = pre_process(rfam_uri(rfam_id))
my_opt=dict(opts)
my_opt.update({'size':20, 'vertex_color':'importance', 'colormap':'autumn', 'invert_colormap':True, 'prog':'sfdp', 'size_x_to_y_ratio':2})
plot(graphs,opts=my_opt, num=3)


Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF02275/alignment?acc=RF02275&format=fastau&download=0 HTTP/1.1" 200 3418
Positive data: Instances: 33 ; Features: 1048577 with an avg of 1230 features per instance
Negative data: Instances: 66 ; Features: 1048577 with an avg of 1231 features per instance
Elapsed time: 4.2 secs

In [ ]: