Prepare a function that composes all the designed pre-processing actions over the original data.
In [1]:
import itertools
def join_pre_processes(data, pre_processes, weights):
    """Run several pre-processing functions over the same input via EDeN.

    Args:
        data: either an object that is already iterable, or a string
            (URL or file name) that is first read with ``fasta_to_fasta``.
        pre_processes: list of pre-processing callables, forwarded unchanged
            to ``eden.util.join_pre_processes``.
        weights: list of weights, forwarded unchanged to
            ``eden.util.join_pre_processes``.

    Returns:
        Whatever ``eden.util.join_pre_processes`` returns; callers in this
        notebook unpack it as ``graphs_list, weights``.
    """
    from eden import util

    # A URL or file name arrives as a string. Python 2 strings have no
    # __iter__ attribute, but Python 3 strings do, so relying on hasattr alone
    # would treat a URL as an iterable of characters on Python 3. Testing for
    # the string types explicitly keeps the Python 2 behaviour on both.
    string_types = (type(''), type(u''))
    if isinstance(data, string_types) or not hasattr(data, '__iter__'):
        # not an iterable: process the url or file with fasta_to_fasta
        from eden.modifier.fasta import fasta_to_fasta
        iterable = fasta_to_fasta(data)
    else:
        iterable = data
    return util.join_pre_processes(iterable, pre_processes=pre_processes, weights=weights)
Design the pre-processing functions
1 suboptimal structures with rnashapes
In [2]:
def pre_process_rnashapes(iterable):
    """Convert FASTA input into EDeN graphs with ``rnashapes_to_eden``.

    The input is first turned into sequences with ``fasta_to_sequence`` and
    then handed to ``rnashapes_to_eden`` with shape_type=5, energy_range=35
    and max_num=3.
    """
    from eden.converter.fasta import fasta_to_sequence
    from eden.converter.rna.rnashapes import rnashapes_to_eden

    # fixed folding options used throughout this notebook
    folding_options = dict(shape_type=5, energy_range=35, max_num=3)
    return rnashapes_to_eden(fasta_to_sequence(iterable), **folding_options)
2 sequence information
In [3]:
def pre_process_fasta(iterable):
    """Convert FASTA input into EDeN graphs built from the sequences alone.

    The input is parsed with ``fasta_to_sequence`` and the resulting
    sequences are passed to ``sequence_to_eden`` with default options.
    """
    from eden.converter.fasta import fasta_to_sequence, sequence_to_eden

    return sequence_to_eden(fasta_to_sequence(iterable))
3 structure information
In [4]:
def pre_process_structure(iterable):
    """Convert FASTA input into EDeN graphs with ``rnashapes_struct_to_eden``.

    The input is parsed with ``fasta_to_sequence``; the converter is called
    with energy=True, shape=True, shape_type=5, energy_range=35 and max_num=3.
    """
    from eden.converter.fasta import fasta_to_sequence
    from eden.converter.rna.rnashapes_struct import rnashapes_struct_to_eden

    sequences = fasta_to_sequence(iterable)
    return rnashapes_struct_to_eden(
        sequences,
        energy=True,
        shape=True,
        shape_type=5,
        energy_range=35,
        max_num=3,
    )
4 condensed information via contraction
In [5]:
def pre_process_contraction(iterable):
    """Convert FASTA input into contracted EDeN graphs.

    Steps, in order:
      1. parse the input with ``fasta_to_sequence`` and build graphs with
         ``rnashapes_to_eden`` (shape_type=5, energy_range=35, max_num=3);
      2. annotate every node's 'type' attribute with the labels of its
         incident edges;
      3. contract the graphs on the 'type' attribute.
    """
    from eden.converter.fasta import fasta_to_sequence
    from eden.converter.rna.rnashapes import rnashapes_to_eden
    from eden.modifier.graph import vertex_attributes
    from eden.modifier.graph.structure import contraction, contraction_modifier

    folded = rnashapes_to_eden(fasta_to_sequence(iterable), shape_type=5, energy_range=35, max_num=3)

    # store the incident edges' labels in the node attribute 'type'
    typed = vertex_attributes.incident_edge_label(folded, level=1, output_attribute='type', separator='.')

    reducers = [
        # 'type' values of contracted nodes become the 'label' of the result
        contraction_modifier(attribute_in='type', attribute_out='label', reduction='set_categorical'),
        # 'weight' values of contracted nodes are summed into 'weight'
        contraction_modifier(attribute_in='weight', attribute_out='weight', reduction='sum'),
    ]

    # contract the graph on the 'type' attribute
    return contraction(typed, contraction_attribute='type', modifiers=reducers)
combine them in a single converter
In [6]:
def pre_process(data):
    """Apply the four converters of this notebook to ``data`` with fixed weights.

    Returns the result of ``join_pre_processes`` for the converters and
    weights listed below.
    """
    # each converter is listed next to the weight it is combined with
    weighted_converters = [
        (pre_process_structure, 0.1),
        (pre_process_rnashapes, 0.5),
        (pre_process_fasta, 0.1),
        (pre_process_contraction, 0.3),
    ]
    pre_processes = [converter for converter, _ in weighted_converters]
    weights = [weight for _, weight in weighted_converters]
    return join_pre_processes(data, pre_processes, weights)
acquire data from online resource
In [7]:
def rfam_uri(family_id, remote=False):
    """Return where the FASTA data of an Rfam family is read from.

    Previously this function was defined twice in a row, so the second
    definition silently replaced the first and the URL variant was dead code.
    Both variants are now reachable through one function.

    Args:
        family_id: Rfam accession, e.g. 'RF00005'.
        remote: when False (the default, matching the definition that used to
            win) return the relative file name '<family_id>.fa'; when True
            return the rfam.xfam.org alignment URL for the family.

    Returns:
        The file name or URL as a string.
    """
    if remote:
        return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0' % (family_id, family_id)
    # one-element tuple so that a tuple-valued family_id cannot be mistaken
    # for the format arguments
    return '%s.fa' % (family_id,)
In [8]:
# Rfam family accessions tried in this notebook. Each assignment overwrites
# the previous one, so only the last value is in effect after the cell runs;
# reorder the lines to select another family.
rfam_id = 'RF02275' # Hammerhead_HH9
rfam_id = 'RF00871' # microRNA mir-689
rfam_id = 'RF00005' # tRNA
use EDeN vectorizer to transform the graph instances into sparse vectors
In [10]:
def describe(X):
    """Print the size of a data matrix.

    Args:
        X: object exposing ``shape`` (rows, columns) and ``getnnz()``, as the
            matrices returned by the vectorizer in this notebook do.

    Prints the number of instances (rows), the number of features (columns)
    and the integer average of stored entries per instance. An empty matrix
    reports an average of 0 instead of raising ZeroDivisionError.
    """
    n_instances, n_features = X.shape[0], X.shape[1]
    # floor division keeps the integer result the Python 2 '/' produced
    avg_features = X.getnnz() // n_instances if n_instances else 0
    # a single parenthesised argument prints identically on Python 2 and 3
    print('Instances: %d ; Features: %d with an avg of %d features per instance' % (n_instances, n_features, avg_features))
def list_model(uri, pre_process):
    """Fit an estimator that separates the data at ``uri`` from a shuffled copy.

    Args:
        uri: location of the FASTA data; passed to ``pre_process`` directly
            and to ``fasta_to_fasta`` for the shuffled copy.
        pre_process: callable returning ``(graphs_list, weights)`` for its
            input.

    Returns:
        Tuple ``(estimator, vectorizer)``: the result of ``fit_estimator``
        and the ``ListVectorizer`` used to build both data matrices.
    """
    from eden.graph import ListVectorizer
    from eden.modifier.fasta import fasta_to_fasta, shuffle_modifier
    from eden.util import fit_estimator

    vectorizer = ListVectorizer(complexity=1)

    def vectorize(source):
        # pre-process, vectorize and report the matrix size for one data source
        graphs_list, weights = pre_process(source)
        matrix = vectorizer.transform(graphs_list, weights=weights)
        describe(matrix)
        return matrix

    # positive class: the data exactly as found at uri
    positives = vectorize(uri)

    # negative class: the same data passed through shuffle_modifier
    shuffled = fasta_to_fasta(uri, modifier=shuffle_modifier, times=2, order=2)
    negatives = vectorize(shuffled)

    estimator = fit_estimator(positive_data_matrix=positives, negative_data_matrix=negatives, cv=10)
    return estimator, vectorizer
In [12]:
%%time
# Build the model for the currently selected rfam_id; list_model returns the
# fitted estimator together with the vectorizer it used. The %%time magic
# must remain the first line of the cell.
estimator_str, vectorizer_str = list_model(rfam_uri( rfam_id ), pre_process)