In [1]:
import logging

from eden.util import configure_logging

# getLogger() with no name returns the root logger, so this configuration
# applies process-wide; verbosity=2 is forwarded to EDeN's helper as-is.
configure_logging(
    logging.getLogger(),
    verbosity=2)

In [2]:
# Rfam family used throughout the notebook.
# Other accessions that were previously listed here as alternatives:
#   'RF02275'  Hammerhead_HH9
#   'RF00871'  microRNA mir-689
rfam_id = 'RF00005'  # tRNA

def rfam_uri(family_id, use_local_file=False):
    """Return where the FASTA data for an Rfam family is read from.

    This function used to be defined twice in a row, so the local-file
    variant was silently shadowed by the URL variant. Both behaviours are
    kept here behind one explicit switch.

    Parameters
    ----------
    family_id : str
        Rfam accession, e.g. 'RF00005'.
    use_local_file : bool, optional
        If True, return the relative file name '<family_id>.fa'.
        If False (default), return the rfam.xfam.org alignment URL
        requested with ``format=fastau``.

    Returns
    -------
    str
        A file name or an HTTP URL.
    """
    if use_local_file:
        return '%s.fa' % (family_id)
    # The accession appears twice in the URL: once in the path and once as
    # the ``acc`` query parameter.
    return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0' % (family_id, family_id)

In [3]:
%%time
from eden.converter.fasta import fasta_to_sequence
from eden.converter.rna.rnafold import rnafold_to_eden
from eden.util import mp_pre_process

# Positive class: sequences read from the location rfam_uri() gives for the
# selected family.
pos_seqs = fasta_to_sequence(rfam_uri(rfam_id))

# Keyword options forwarded to rnafold_to_eden. The same dict is reused by
# the cell that builds the negative graphs, so it must be defined first.
pre_processor_args = {
    'max_num': 3,
    'shape_type': 5,
    'energy_range': 20
}

# Convert the sequences to graphs through EDeN's multi-process helper.
pos_graphs = mp_pre_process(
    pos_seqs,
    pre_processor=rnafold_to_eden,
    pre_processor_args=pre_processor_args,
    n_blocks=5,
    n_jobs=4)


Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF00005/alignment?acc=RF00005&format=fastau&download=0 HTTP/1.1" 200 90476
CPU times: user 324 ms, sys: 101 ms, total: 425 ms
Wall time: 5.84 s

In [4]:
%%time
from eden.converter.fasta import fasta_to_sequence
from eden.converter.rna.rnafold import rnafold_to_eden
from eden.modifier.seq import seq_to_seq, shuffle_modifier
from eden.util import mp_pre_process

# Negative class: the family's sequences are read again and passed through
# shuffle_modifier with times=2.
# NOTE(review): presumably this yields two shuffled copies per input
# sequence -- confirm against seq_to_seq.
neg_seqs = seq_to_seq(
    fasta_to_sequence(rfam_uri(rfam_id)),
    modifier=shuffle_modifier,
    times=2)

# Same graph conversion as for the positive set; pre_processor_args comes
# from the positive-set cell.
neg_graphs = mp_pre_process(
    neg_seqs,
    pre_processor=rnafold_to_eden,
    pre_processor_args=pre_processor_args,
    n_blocks=5,
    n_jobs=4)


Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF00005/alignment?acc=RF00005&format=fastau&download=0 HTTP/1.1" 200 90476
CPU times: user 812 ms, sys: 180 ms, total: 992 ms
Wall time: 13 s

In [5]:
from eden.util import random_bipartition_iter

# Value passed as relative_size to both splits below.
# NOTE(review): presumably the fraction that goes to the training side --
# confirm against random_bipartition_iter. No seed is passed here, so the
# partition may differ between runs.
train_test_split = .7

# Split positives and negatives independently into train/test iterables.
iterable_pos_train, iterable_pos_test = random_bipartition_iter(
    pos_graphs,
    relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(
    neg_graphs,
    relative_size=train_test_split)

In [6]:
%%time
from eden.graph import Vectorizer
from eden.util import fit

# Graph vectorizer shared by training and evaluation.
vectorizer = Vectorizer(complexity=3)

# Train on the positive/negative training iterables; random_state is fixed
# to 1 for the fit call.
estimator = fit(
    iterable_pos_train,
    iterable_neg_train,
    vectorizer,
    n_jobs=4,
    cv=10,
    n_iter_search=1,
    random_state=1)


Positive data: Instances: 667 ; Features: 1048577 with an avg of 1246 features per instance
Negative data: Instances: 1335 ; Features: 1048577 with an avg of 1214 features per instance
Elapsed time: 38.0 secs
CPU times: user 20.1 s, sys: 1.95 s, total: 22.1 s
Wall time: 38.1 s

In [7]:
%%time
from eden.util import estimate

# Evaluate the fitted estimator on the held-out positive/negative iterables,
# using the same vectorizer that was used for training.
estimate(
    iterable_pos_test,
    iterable_neg_test,
    estimator,
    vectorizer)


Test set
Instances: 860 ; Features: 1048577 with an avg of 1227 features per instance
--------------------------------------------------------------------------------
Test Estimate
             precision    recall  f1-score   support

         -1       0.98      0.97      0.98       573
          1       0.95      0.96      0.96       287

avg / total       0.97      0.97      0.97       860

APR: 0.988
ROC: 0.992
CPU times: user 7 s, sys: 984 ms, total: 7.98 s
Wall time: 12.6 s

In [8]:
rfam_id = 'RF00005'  # tRNA
# Parenthesised single-argument form: prints the same text under Python 2
# and is also valid Python 3, unlike the bare print statement.
print(rfam_uri(rfam_id))


http://rfam.xfam.org/family/RF00005/alignment?acc=RF00005&format=fastau&download=0