In [20]:
from eden.util import configure_logging
import logging
configure_logging(logging.getLogger(),verbosity=2)
In [21]:
def rfam_uri(family_id):
    return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0' % (family_id, family_id)

# the definition below overrides the one above: read the family from a local FASTA file instead
def rfam_uri(family_id):
    return '%s.fa' % (family_id)
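The second definition expects a local file named '<family_id>.fa'. If the files are not already on disk, they can be fetched once from the Rfam endpoint used by the first definition. The helper below is only an illustrative sketch (the fetch_rfam name and the use of urllib.urlretrieve are not part of the original notebook):
In [ ]:
import urllib
def fetch_rfam(family_id):
    #download the ungapped FASTA alignment and save it under the name the local rfam_uri expects
    url = 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0' % (family_id, family_id)
    urllib.urlretrieve(url, '%s.fa' % family_id)
fetch_rfam('RF00005')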
In [22]:
#rfam_id = 'RF02275' #Hammerhead_HH9
#rfam_id = 'RF00871' #microRNA mir-689
rfam_id = 'RF00005' #tRNA
In [23]:
def pre_processor(data, **args):
    from eden.converter.rna.rnafold import rnafold_to_eden
    graphs = rnafold_to_eden(data, **args)
    return graphs
In [24]:
# this definition overrides the RNAfold-based one above: the RNAshapes folding is used below
def pre_processor(data, **args):
    from eden.converter.rna.rnashapes import rnashapes_to_eden
    graphs = rnashapes_to_eden(data, **args)
    return graphs
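As a quick sanity check, a single toy sequence can be folded and the resulting graph inspected. This is only a sketch: it assumes the converters accept the (header, sequence) pairs produced by fasta_to_sequence and yield networkx-style graphs, and the toy sequence itself is arbitrary.
In [ ]:
toy = [('toy_hairpin', 'GGGGGCUUCGGCCCCCAAAAA')]
#fold the toy sequence and take the first graph from the (possibly lazy) iterator
g = iter(pre_processor(toy)).next()
print 'nodes:', g.number_of_nodes(), 'edges:', g.number_of_edges()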
In [25]:
from eden.graph import Vectorizer
vectorizer = Vectorizer()
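The vectorizer maps graphs to sparse feature vectors. A minimal, illustrative check of the round trip from sequence to feature matrix (again assuming (header, sequence) input and that Vectorizer.transform returns a scipy sparse matrix):
In [ ]:
toy = [('toy_hairpin', 'GGGGGCUUCGGCCCCCAAAAA')]
graphs = pre_processor(toy)
X = vectorizer.transform(graphs)
print 'feature matrix shape:', X.shape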
In [26]:
from sklearn.linear_model import SGDClassifier, Perceptron, PassiveAggressiveClassifier
#estimator = PassiveAggressiveClassifier(shuffle=True)
#estimator = Perceptron(class_weight='auto', shuffle=True)
estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True)
In [27]:
#data setup
model_fname='eden_model_%s'%rfam_id
size=50
train_test_split=0.5
n_iter=8
times=4
n_jobs=8
In [28]:
#create iterable from files
from eden.converter.fasta import fasta_to_sequence
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
from itertools import tee
seqs,seqs_=tee(seqs)
iterable_pos = seqs
from eden.modifier.seq import seq_to_seq, shuffle_modifier
iterable_neg = seq_to_seq( seqs_, modifier=shuffle_modifier, times=times, order=2 )
#consider only the first 'size' elements
from itertools import islice
iterable_pos = islice(iterable_pos,size)
iterable_neg = islice(iterable_neg,size*times)
#split train/test
from eden.util import random_bipartition_iter
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)
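To see what the negative class looks like, shuffle_modifier can be applied to a toy sequence: with order=2 the shuffles are expected to preserve dinucleotide composition. A small illustrative sketch (it assumes seq_to_seq yields (header, sequence) pairs):
In [ ]:
from eden.modifier.seq import seq_to_seq, shuffle_modifier
toy = [('toy_seq', 'GCGGAUUUAGCUCAGUUGGGAGAGC')]
#order=2 shuffling: generate 'times' shuffled copies per input sequence
for header, seq in seq_to_seq(toy, modifier=shuffle_modifier, times=2, order=2):
    print header, seq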
In [29]:
%%time
#make predictive model
from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel(pre_processor=pre_processor,
                                                estimator=estimator,
                                                vectorizer=vectorizer,
                                                n_jobs=n_jobs,
                                                pre_processor_n_jobs=n_jobs,
                                                n_blocks=5)
#optimize hyperparameters and fit model
from numpy.random import randint
from numpy.random import uniform
pre_processor_parameters={'max_num':[3,1,2,3],
                          'shape_type':[4,5],
                          'energy_range':[30,5,10,20,30,40]}
vectorizer_parameters={'complexity':[2,3]}
estimator_parameters={'n_iter':randint(5, 200, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      'l1_ratio':uniform(0.1, 0.9, size=n_iter),
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      'power_t':uniform(0.1, size=n_iter),
                      'alpha':[10**x for x in range(-8,0)],
                      'eta0':[10**x for x in range(-4,-1)],
                      'learning_rate':["invscaling", "constant", "optimal"],
                      'n_jobs':[n_jobs]}
#quick first pass: a single hyperparameter-sampling iteration (the full search follows below)
model.optimize(iterable_pos_train, iterable_neg_train,
               model_name=model_fname,
               n_iter=1,
               pre_processor_parameters=pre_processor_parameters,
               vectorizer_parameters=vectorizer_parameters,
               estimator_parameters=estimator_parameters)
In [30]:
%%time
#estimate predictive performance
print model.get_parameters()
apr, roc = model.estimate( iterable_pos_test, iterable_neg_test )
In [31]:
#create iterable from files
from eden.converter.fasta import fasta_to_sequence
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
from itertools import tee
seqs,seqs_=tee(seqs)
iterable_pos = seqs
from eden.modifier.seq import seq_to_seq, shuffle_modifier
iterable_neg = seq_to_seq( seqs_, modifier=shuffle_modifier, times=times, order=2 )
#consider only the first 'size' elements
from itertools import islice
iterable_pos = islice(iterable_pos,size)
iterable_neg = islice(iterable_neg,size*times)
#split train/test
from eden.util import random_bipartition_iter
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)
In [32]:
%%time
#make predictive model
from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel(pre_processor=pre_processor,
                                                estimator=estimator,
                                                vectorizer=vectorizer,
                                                n_jobs=n_jobs,
                                                pre_processor_n_jobs=n_jobs,
                                                n_blocks=5)
#optimize hyperparameters and fit model
from numpy.random import randint
from numpy.random import uniform
pre_processor_parameters={'max_num':[3,1,2,3],
                          'shape_type':[4,5],
                          'energy_range':[30,5,10,20,30,40]}
vectorizer_parameters={'complexity':[2,3]}
estimator_parameters={'n_iter':randint(5, 200, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      'l1_ratio':uniform(0.1, 0.9, size=n_iter),
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      'power_t':uniform(0.1, size=n_iter),
                      'alpha':[10**x for x in range(-8,0)],
                      'eta0':[10**x for x in range(-4,-1)],
                      'learning_rate':["invscaling", "constant", "optimal"],
                      'n_jobs':[n_jobs]}
model.optimize(iterable_pos_train, iterable_neg_train,
               model_name=model_fname,
               max_total_time=-1,
               n_iter=n_iter,
               n_inner_iter_estimator=5,
               cv=5,
               #prefer parameter settings with high and stable cross-validation scores: mean minus two standard deviations
               score_func=lambda avg_score, std_score: avg_score - std_score * 2,
               scoring='roc_auc',
               two_steps_optimization=True,
               pre_processor_parameters=pre_processor_parameters,
               vectorizer_parameters=vectorizer_parameters,
               estimator_parameters=estimator_parameters)
In [33]:
%%time
#estimate predictive performance
print model.get_parameters()
apr, roc = model.estimate( iterable_pos_test, iterable_neg_test )
Models can be reloaded from disk: the optimization step above stored the best model under model_fname, and it can now be reused for prediction.
In [34]:
from eden.model import ActiveLearningBinaryClassificationModel
model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)
from eden.converter.fasta import fasta_to_sequence
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
from itertools import tee
seqs,seqs_=tee(seqs)
iterable_pos = seqs
#consider only the first 'size' elements
from itertools import islice
iterable_pos = islice(iterable_pos,size)
predictions= model2.decision_function( iterable_pos )
for n,i in enumerate(sorted(predictions)): print n,i
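Only the sorted scores are printed above; pairing each score with its sequence requires a second copy of the input iterator, since the one above has been consumed. The ranking below is an illustrative sketch, not part of the original notebook:
In [ ]:
from eden.converter.fasta import fasta_to_sequence
from itertools import tee, islice
seqs = islice(fasta_to_sequence(rfam_uri(rfam_id)), size)
seqs, seqs_ = tee(seqs)
scores = model2.decision_function(seqs)
#rank sequences by decision value, highest first, and show the top hits
ranked = sorted(zip(scores, seqs_), reverse=True)
for score, (header, seq) in ranked[:5]:
    print score, header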
In [35]:
#create iterable from files
from eden.converter.fasta import fasta_to_sequence
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
from itertools import tee
seqs,seqs_=tee(seqs)
iterable_pos = seqs
from eden.modifier.seq import seq_to_seq, shuffle_modifier
iterable_neg = seq_to_seq( seqs_, modifier=shuffle_modifier, times=times, order=2 )
#consider only the first 'size' elements
from itertools import islice
iterable_pos = islice(iterable_pos,size)
iterable_neg = islice(iterable_neg,size*times)
#split train/test
from eden.util import random_bipartition_iter
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)
In [36]:
%%time
#make predictive model
from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel(pre_processor=pre_processor,
                                                estimator=estimator,
                                                vectorizer=vectorizer,
                                                n_jobs=n_jobs,
                                                pre_processor_n_jobs=n_jobs,
                                                n_blocks=8)
#optimize hyperparameters and fit model
from numpy.random import randint
from numpy.random import uniform
pre_processor_parameters={'max_num':[1,3],
                          'shape_type':[5],
                          'energy_range':[10,30]}
vectorizer_parameters={'complexity':[2,3]}
estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      'l1_ratio':uniform(0.1, 0.9, size=n_iter),
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      'power_t':uniform(0.1, size=n_iter),
                      'alpha':[10**x for x in range(-8,0)],
                      'eta0':[10**x for x in range(-4,-1)],
                      'learning_rate':["invscaling", "constant", "optimal"],
                      'n_jobs':[n_jobs]}
active_set_size = size * 2
model_fname='eden_model_active_%s'%rfam_id
model.optimize(iterable_pos_train, iterable_neg_train,
               model_name=model_fname,
               score_func=lambda avg_score, std_score: avg_score - std_score * 2,
               scoring='roc_auc',
               n_active_learning_iterations=4,
               n_iter=n_iter,
               size_positive=-1,
               size_negative=active_set_size,
               cv=5,
               pre_processor_parameters=pre_processor_parameters,
               vectorizer_parameters=vectorizer_parameters,
               estimator_parameters=estimator_parameters)
In [37]:
%%time
#estimate predictive performance
apr, roc = model.estimate( iterable_pos_test, iterable_neg_test )
In [38]:
from eden.model import ActiveLearningBinaryClassificationModel
model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)
from eden.converter.fasta import fasta_to_sequence
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
from itertools import tee
seqs,seqs_=tee(seqs)
iterable_pos = seqs
#consider only the first 'size' elements
from itertools import islice
iterable_pos = islice(iterable_pos,size)
predictions= model2.decision_function( iterable_pos )
for n,i in enumerate(sorted(predictions)): print n,i