RNA family characterization

Application scenario: we want to build a predictive model for a set of ncRNAs using multiple converters, each with a different weight: e.g. sequence information + structure information + condensed information

prepare a function that composes all design actions over the original data



In [1]:

    
import itertools
def join_pre_processes( data, pre_processes, weights):
    if hasattr( data, '__iter__' ):
        iterable = data
    else: #if not then process url or file with fasta_to_fasta
        from eden.modifier.fasta import fasta_to_fasta
        iterable = fasta_to_fasta( data )
    from eden import util
    return util.join_pre_processes(iterable, pre_processes=pre_processes, weights=weights)

design pre processing functions

1 suboptimal structures with rnashapes



In [2]:

    
def pre_process_rnashapes( iterable ):
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(iterable)
    
    from eden.converter.rna.rnashapes import rnashapes_to_eden
    graphs = rnashapes_to_eden( seqs, shape_type = 5, energy_range = 35, max_num = 3 )
    return graphs

2 sequence information



In [3]:

    
def pre_process_fasta( iterable ):
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(iterable)
    
    from eden.converter.fasta import sequence_to_eden
    graphs = sequence_to_eden( seqs )
    return graphs

3 structure information



In [4]:

    
def pre_process_structure( iterable ):
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(iterable)
    
    from eden.converter.rna.rnashapes_struct import rnashapes_struct_to_eden
    graphs = rnashapes_struct_to_eden(seqs, energy=True, shape=True, shape_type=5, energy_range=35, max_num=3)
    return graphs

4 condensed information via contraction



In [5]:

    
def pre_process_contraction( iterable ):
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(iterable)
    
    from eden.converter.rna.rnashapes import rnashapes_to_eden
    graphs = rnashapes_to_eden( seqs, shape_type=5, energy_range=35, max_num=3 )
    
    #annotate in node attribute 'type' the incident edges' labels
    from eden.modifier.graph import vertex_attributes
    graphs = vertex_attributes.incident_edge_label(graphs, level=1, output_attribute='type', separator='.')
    
    from eden.modifier.graph.structure import contraction, contraction_modifier
    label_modifier = contraction_modifier(attribute_in='type', attribute_out='label', reduction='set_categorical')
    
    #reduce all 'weight' attributes of contracted nodes using a sum to be written in the 'weight' attribute of the resulting graph 
    weight_modifier = contraction_modifier(attribute_in='weight', attribute_out='weight', reduction='sum')
    modifiers = [label_modifier, weight_modifier]
    
    #contract the graph on the 'type' attribute
    graphs = contraction(graphs, contraction_attribute='type', modifiers=modifiers)
    
    return graphs

combine them in a single converter



In [6]:

    
def pre_process( data ):
    weights = [0.1, 0.5, 0.1, 0.3]
    pre_processes = [pre_process_structure, pre_process_rnashapes, pre_process_fasta, pre_process_contraction]
    return join_pre_processes( data, pre_processes, weights)

acquire data from online resource



In [7]:

    
def rfam_uri(family_id):
    return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0'%(family_id,family_id)
def rfam_uri(family_id):
    return '%s.fa'%(family_id)



In [8]:

    
rfam_id = 'RF02275' #Hammerhead_HH9
rfam_id = 'RF00871' #microRNA mir-689
rfam_id = 'RF00005' #tRNA

use EDeN vectorizer to transform the graph instances into sparse vectors



In [10]:

    
def describe(X):
    print 'Instances: %d ; Features: %d with an avg of %d features per instance' % (X.shape[0], X.shape[1],  X.getnnz()/X.shape[0])
    
def list_model(uri, pre_process):

    from eden.graph import ListVectorizer
    vectorizer = ListVectorizer( complexity=1 )

    graphs_list, weights = pre_process( uri )
    X1 = vectorizer.transform( graphs_list, weights=weights )
    describe(X1)
    
    from eden.modifier.fasta import fasta_to_fasta, shuffle_modifier
    graphs_list, weights = pre_process( fasta_to_fasta( uri , modifier=shuffle_modifier, times=2, order=2) )
    X2 = vectorizer.transform( graphs_list, weights=weights )
    describe(X2)
    
    from eden.util import fit_estimator
    estimator = fit_estimator( positive_data_matrix=X1, negative_data_matrix=X2, cv=10 )
    
    return estimator, vectorizer



In [12]:

    
%%time
estimator_str, vectorizer_str = list_model(rfam_uri( rfam_id ), pre_process)









    



Instances: 954 ; Features: 1048577 with an avg of 320 features per instance
Instances: 1908 ; Features: 1048577 with an avg of 316 features per instance
Classifier:
SGDClassifier(alpha=0.00023633482673, average=True, class_weight='auto',
       epsilon=0.1, eta0=0.391608105253, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=45, n_jobs=-1,
       penalty='l2', power_t=1.03880139312, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.983 +- 0.009
           precision: 0.979 +- 0.013
              recall: 0.969 +- 0.024
                  f1: 0.974 +- 0.014
   average_precision: 0.996 +- 0.003
             roc_auc: 0.997 +- 0.003
--------------------------------------------------------------------------------
CPU times: user 6min 32s, sys: 53.9 s, total: 7min 25s
Wall time: 12min 56s