In [5]:
def train_ncRNA_model(fname=None, model_fname=None, n_iter=40):
    # parameters
    times = 5
    size = 100
    active_set_size = 1000
    threshold = 1
    n_active_learning_iterations = 4
    train_test_split = 0.7
    n_jobs = 8

    def rfam_uri(family_id):
        return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0' % (family_id, family_id)

    def rfam_uri(family_id):  # override: read from a local FASTA file instead of the remote Rfam URL
        return '%s.fa' % (family_id)

    def pre_processor(data, **args):
        # fold each RNA sequence into its secondary-structure graphs with RNAshapes
        from eden.converter.rna.rnashapes import rnashapes_to_eden
        graphs = rnashapes_to_eden(data, **args)
        return graphs

    from eden.graph import Vectorizer
    vectorizer = Vectorizer()

    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(class_weight='auto', shuffle=True)

    # create iterable from files
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(rfam_uri(fname))
    from itertools import tee
    seqs, seqs_ = tee(seqs)
    iterable_pos = seqs
    # negatives: shuffled copies of the positive sequences
    from eden.modifier.seq import seq_to_seq, shuffle_modifier
    iterable_neg = seq_to_seq(seqs_, modifier=shuffle_modifier, times=times, order=2)

    # consider only the first 'size' elements
    from itertools import islice
    iterable_pos = islice(iterable_pos, size)
    iterable_neg = islice(iterable_neg, size * times)

    # split train/test
    from eden.util import random_bipartition_iter
    iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
    iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)

    # make predictive model
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel(pre_processor=pre_processor,
                                                    estimator=estimator,
                                                    vectorizer=vectorizer,
                                                    n_jobs=n_jobs)

    # optimize hyperparameters and fit model
    from numpy.random import randint
    from numpy.random import uniform
    pre_processor_parameters = {'max_num': [1, 2, 3],
                                'shape_type': [4, 5],
                                'energy_range': randint(10, 40, size=n_iter)}
    vectorizer_parameters = {'complexity': [1, 2, 3]}
    estimator_parameters = {'n_iter': randint(5, 100, size=n_iter),
                            'penalty': ['l1', 'l2', 'elasticnet'],
                            'l1_ratio': uniform(0.1, 0.9, size=n_iter),
                            'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                            'power_t': uniform(0.1, size=n_iter),
                            'alpha': [10 ** x for x in range(-8, 0)],
                            'eta0': [10 ** x for x in range(-4, -1)],
                            'learning_rate': ['invscaling', 'constant', 'optimal'],
                            'n_jobs': [n_jobs]}
    model.optimize(iterable_pos_train, iterable_neg_train,
                   model_name=model_fname,
                   n_active_learning_iterations=n_active_learning_iterations,
                   size_positive=-1,
                   size_negative=active_set_size,
                   n_iter=n_iter, cv=3,
                   pre_processor_parameters=pre_processor_parameters,
                   vectorizer_parameters=vectorizer_parameters,
                   estimator_parameters=estimator_parameters)

    # estimate predictive performance
    model.estimate(iterable_pos_test, iterable_neg_test)
def test_ncRNA_model(fname=None, model_fname=None):
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel()
    model.load(model_fname)

    def rfam_uri(family_id):
        return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0' % (family_id, family_id)

    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(rfam_uri(fname))
    from itertools import tee
    seqs, seqs_ = tee(seqs)
    # score every sequence with the trained model
    predictions = model.decision_function(seqs_)
    from itertools import izip
    seqs, seqs_ = tee(seqs)
    # pair each (header, sequence) tuple with its decision score
    results = [(p, s) for s, p in izip(seqs_, predictions)]
    return results
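Note that the second rfam_uri definition in train_ncRNA_model expects a local FASTA file named <family_id>.fa. If that file has not been downloaded yet, a small helper along these lines could fetch and cache it first (a minimal sketch, assuming the Rfam alignment URL above is still reachable; fetch_rfam_fasta is a hypothetical name, not part of EDeN):
In [ ]:
import os
import urllib

def fetch_rfam_fasta(family_id):
    # hypothetical helper: download the Rfam FASTA once and cache it as <family_id>.fa
    fname = '%s.fa' % family_id
    if not os.path.exists(fname):
        url = 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0' % (family_id, family_id)
        urllib.urlretrieve(url, fname)
    return fname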
In [6]:
from eden.util import configure_logging
import logging
configure_logging(logging.getLogger(),verbosity=2)
In [9]:
# candidate Rfam families; only the last assignment takes effect
rfam_id = 'RF02275'  # Hammerhead_HH9
rfam_id = 'RF00871'  # microRNA mir-689
rfam_id = 'RF00005'  # tRNA
rfam_id = 'RF00015'
rfam_id = 'RF02012'
In [ ]:
rfam_id = 'RF00005' #tRNA
model_fname = rfam_id + '.model'
train_ncRNA_model(fname=rfam_id, model_fname=model_fname, n_iter=20)
results = test_ncRNA_model(fname=rfam_id, model_fname=model_fname)
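Before rendering the table it can be useful to check how many test sequences the model actually scores above zero; a minimal sketch, assuming results holds the (score, (header, sequence)) tuples returned by test_ncRNA_model:
In [ ]:
# count sequences with a positive decision score, i.e. predicted family members
positives = [(score, header) for score, (header, seq) in results if score > 0]
print '%d of %d sequences scored as %s members' % (len(positives), len(results), rfam_id)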
In [ ]:
from ipy_table import make_table, apply_theme, set_global_style
# one row per test sequence: header plus sequence, and the model's decision score
mat = [('ID', 'Conf')]
for confidence, (header, seq) in sorted(results, reverse=True):
    mat.append((header + ' ' + seq, confidence))
make_table(mat)
apply_theme('basic')
set_global_style(float_format='%0.3e')
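If ipy_table is not installed, a plain-text listing of the highest-scoring sequences carries the same information; a minimal sketch over the results list built above:
In [ ]:
# print the ten highest-scoring sequences without ipy_table
for score, (header, seq) in sorted(results, reverse=True)[:10]:
    print '%0.3e\t%s %s' % (score, header, seq)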