In [20]:
from eden.util import configure_logging
import logging
# Attach EDeN's logging config to the root logger; verbosity=2 enables
# the detailed per-iteration progress messages seen in the outputs below.
configure_logging(logging.getLogger(),verbosity=2)

In [21]:
def rfam_uri(family_id, local=True):
    """Return the location of the FASTA data for an Rfam family.

    The original notebook defined ``rfam_uri`` twice; the second (local-file)
    definition silently shadowed the remote-URL one. Both variants are kept
    here behind a flag whose default reproduces the effective behavior.

    Parameters
    ----------
    family_id : str
        Rfam accession, e.g. 'RF00005'.
    local : bool, optional
        If True (default), return the local file name '<family_id>.fa';
        otherwise return the Rfam web-service URL for the family alignment.

    Returns
    -------
    str
        File name or URL.
    """
    if local:
        return '%s.fa'%(family_id)
    return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0'%(family_id,family_id)

In [22]:
# Candidate Rfam families; only the last (uncommented) assignment takes
# effect, so the unused alternatives are kept as comments instead of
# dead assignments that are immediately overwritten.
# rfam_id = 'RF02275' #Hammerhead_HH9
# rfam_id = 'RF00871' #microRNA mir-689
rfam_id = 'RF00005' #tRNA

In [23]:
def pre_processor( data, **args):
    """Turn RNA sequences into EDeN graphs using RNAfold structure prediction.

    Note: this definition is shadowed by the RNAshapes variant in the next
    cell; only the last-executed definition is used by the model.
    """
    from eden.converter.rna.rnafold import rnafold_to_eden
    return rnafold_to_eden( data, **args )

In [24]:
def pre_processor( data, **args):
    """Turn RNA sequences into EDeN graphs using RNAshapes structure prediction.

    This definition replaces the RNAfold-based one from the previous cell;
    keyword arguments (e.g. max_num, shape_type, energy_range) are forwarded
    to the converter.
    """
    from eden.converter.rna.rnashapes import rnashapes_to_eden
    return rnashapes_to_eden( data, **args )

In [25]:
from eden.graph import Vectorizer
# Graph-kernel feature vectorizer with default settings; the 'complexity'
# hyperparameter is tuned later via vectorizer_parameters.
vectorizer = Vectorizer()

In [26]:
from sklearn.linear_model import SGDClassifier, Perceptron, PassiveAggressiveClassifier
# Candidate linear classifiers; only the last assignment takes effect, so the
# unused alternatives are kept as comments instead of dead constructions.
# estimator = PassiveAggressiveClassifier(shuffle=True)
# estimator = Perceptron(class_weight='auto', shuffle=True)
# NOTE(review): class_weight='auto' was renamed to 'balanced' in scikit-learn
# 0.17 and removed later; 'auto' is kept to match this notebook's environment.
estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True)

In [27]:
#data setup
model_fname='eden_model_%s'%rfam_id  # file name used to persist the fitted model
size=50               # number of positive sequences to use
train_test_split=0.5  # fraction of each class assigned to the training split
n_iter=8              # optimization iterations and sampled values per parameter
times=4               # negatives generated per positive (shuffle multiplier)
n_jobs=8              # parallel workers for pre-processing and fitting

BinaryClassificationModel with Default Parameters


In [28]:
#create iterable from files
from eden.converter.fasta import fasta_to_sequence
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
from itertools import tee
# tee the generator: one copy yields positives, the other feeds the shuffler
seqs,seqs_=tee(seqs)
iterable_pos = seqs
from eden.modifier.seq import seq_to_seq, shuffle_modifier
# negatives: 'times' shuffled copies per positive (order=2 shuffling —
# presumably preserves dinucleotide frequencies; confirm in eden docs)
iterable_neg = seq_to_seq( seqs_, modifier=shuffle_modifier, times=times, order=2 )

#consider only first 'size' elements
from itertools import islice
iterable_pos = islice(iterable_pos,size)
iterable_neg = islice(iterable_neg,size*times)

#split train/test
from eden.util import random_bipartition_iter
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)

In [29]:
%%time
#make predictive model
from eden.model import ActiveLearningBinaryClassificationModel
# parallelize both graph construction and fitting over n_jobs workers
model = ActiveLearningBinaryClassificationModel(pre_processor=pre_processor, 
                                                estimator=estimator, 
                                                vectorizer=vectorizer,
                                                n_jobs=n_jobs,
                                                pre_processor_n_jobs=n_jobs,
                                                n_blocks=5)

#optimize hyperparameters and fit model
from numpy.random import randint
from numpy.random import uniform
# NOTE(review): repeated values (30 in energy_range, 3 in max_num) increase
# those values' sampling probability — presumably intentional; confirm.
pre_processor_parameters={'max_num':[3,1,2,3], 
                          'shape_type':[4,5], 
                          'energy_range':[30, 5,10,20,30,40]}

vectorizer_parameters={'complexity':[2,3]}

# randomized search space for SGDClassifier; numeric arrays are sampled once here
estimator_parameters={'n_iter':randint(5, 200, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      'power_t':uniform(0.1, size=n_iter),
                      'alpha': [10**x for x in range(-8,0)],
                      'eta0': [10**x for x in range(-4,-1)],
                      'learning_rate': ["invscaling", "constant", "optimal"],
                      'n_jobs':[n_jobs]}

# n_iter=1: a single round — EDeN reports "switching to default parameters"
# (see the output below), i.e. this fits a baseline model without search
model.optimize(iterable_pos_train, iterable_neg_train, 
               model_name=model_fname,
               n_iter=1,
               pre_processor_parameters=pre_processor_parameters, 
               vectorizer_parameters=vectorizer_parameters, 
               estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
energy_range: [30, 5, 10, 20, 30, 40]
   max_num: [3, 1, 2, 3]
shape_type: [4, 5]

Vectorizer:
complexity: [2, 3]

Estimator:
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.45833187  0.46905387  0.64718992  0.61538876  0.47578163  0.45277172
  0.45216319  0.17156218]
learning_rate: ['invscaling', 'constant', 'optimal']
      loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
    n_iter: [180  62 198  63 106  67  20 153]
    n_jobs: [8]
   penalty: ['l1', 'l2', 'elasticnet']
   power_t: [ 0.59115381  0.54069415  0.44542761  0.50605515  0.83524811  0.17475943
  0.82496197  0.66282405]
n_iter is 1: switching to default parameters
Saved current best model in eden_model_RF00005
CPU times: user 4.11 s, sys: 686 ms, total: 4.8 s
Wall time: 7.88 s

In [30]:
%%time
#estimate predictive performance
# show the fitted model's parameters (Python 2 print statement)
print model.get_parameters()
# estimate() reports precision/recall/F1 plus APR and ROC on the held-out split
apr, roc = model.estimate( iterable_pos_test, iterable_neg_test )


	Model parameters:

Pre_processor:
energy_range: 30
   max_num: 3
shape_type: 4

Vectorizer:
complexity: 2

Estimator:
     alpha: 1e-08
      eta0: 0.0001
  l1_ratio: 0.458331867897
learning_rate: invscaling
      loss: hinge
    n_iter: 180
    n_jobs: 8
   penalty: l1
   power_t: 0.591153810828

Classifier:
SGDClassifier(alpha=1e-08, average=True, class_weight='auto', epsilon=0.1,
       eta0=0.0001, fit_intercept=True, l1_ratio=0.45833186789726621,
       learning_rate='invscaling', loss='hinge', n_iter=180, n_jobs=8,
       penalty='l1', power_t=0.59115381082770557, random_state=None,
       shuffle=True, verbose=0, warm_start=False)

Data:
Instances: 125 ; Features: 1048577 with an avg of 824 features per instance

Predictive performace estimate:
             precision    recall  f1-score   support

         -1       1.00      0.02      0.04       100
          1       0.20      1.00      0.34        25

avg / total       0.84      0.22      0.10       125

APR: 0.648
ROC: 0.778
CPU times: user 2.81 s, sys: 452 ms, total: 3.27 s
Wall time: 6.52 s

BinaryClassificationModel with optimisation


In [31]:
#create iterable from files
# (repeat of the earlier setup: the islice/tee iterators were consumed by the
# previous optimize/estimate cells, so fresh ones must be built)
from eden.converter.fasta import fasta_to_sequence
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
from itertools import tee
seqs,seqs_=tee(seqs)
iterable_pos = seqs
from eden.modifier.seq import seq_to_seq, shuffle_modifier
# negatives: 'times' shuffled copies per positive (order=2 shuffling)
iterable_neg = seq_to_seq( seqs_, modifier=shuffle_modifier, times=times, order=2 )

#consider only first 'size' elements
from itertools import islice
iterable_pos = islice(iterable_pos,size)
iterable_neg = islice(iterable_neg,size*times)

#split train/test
from eden.util import random_bipartition_iter
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)

In [32]:
%%time
#make predictive model
from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel(pre_processor=pre_processor, 
                                                estimator=estimator, 
                                                vectorizer=vectorizer,
                                                n_jobs=n_jobs,
                                                pre_processor_n_jobs=n_jobs,
                                                n_blocks=5)

#optimize hyperparameters and fit model
from numpy.random import randint
from numpy.random import uniform
pre_processor_parameters={'max_num':[3,1,2,3], 
                          'shape_type':[4,5], 
                          'energy_range':[30, 5,10,20,30,40]}

vectorizer_parameters={'complexity':[2,3]}

estimator_parameters={'n_iter':randint(5, 200, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      'power_t':uniform(0.1, size=n_iter),
                      'alpha': [10**x for x in range(-8,0)],
                      'eta0': [10**x for x in range(-4,-1)],
                      'learning_rate': ["invscaling", "constant", "optimal"],
                      'n_jobs':[n_jobs]}

# full optimization: n_iter outer rounds, 5 inner estimator rounds each, 5-fold
# CV; score = mean - 2*std of roc_auc favors consistently good configurations.
# two_steps_optimization narrows the search space mid-run (visible in the
# "Parameters range" dump halfway through the output below).
model.optimize(iterable_pos_train, iterable_neg_train, 
               model_name=model_fname,
               max_total_time=-1, 
               n_iter=n_iter,
               n_inner_iter_estimator=5,
               cv=5,
               score_func=lambda avg_score,std_score : avg_score - std_score * 2,
               scoring='roc_auc',
               two_steps_optimization=True,
               pre_processor_parameters=pre_processor_parameters, 
               vectorizer_parameters=vectorizer_parameters, 
               estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
energy_range: [30, 5, 10, 20, 30, 40]
   max_num: [3, 1, 2, 3]
shape_type: [4, 5]

Vectorizer:
complexity: [2, 3]

Estimator:
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.63439113  0.22181719  0.75253848  0.63675579  0.6930968   0.43530866
  0.27201489  0.78718222]
learning_rate: ['invscaling', 'constant', 'optimal']
      loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
    n_iter: [137 146  79  66  79 141  10 168]
    n_jobs: [8]
   penalty: ['l1', 'l2', 'elasticnet']
   power_t: [ 0.4449138   0.69399323  0.61329442  0.15718889  0.8751706   0.89029824
  0.3707164   0.43369971]
iteration: (1/5) 1/8 score (roc_auc): 0.724 (0.910 +- 0.093)


	Iteration: 1/8 (after 7.3 sec; 0:00:07.319340)
Best score (roc_auc): 0.724 (0.910 +- 0.093)

Data:
Instances: 125 ; Features: 1048577 with an avg of 846 features per instance
class: 1 count:25 (0.20)	class: -1 count:100 (0.80)	

	Model parameters:

Pre_processor:
energy_range: 30
   max_num: 3
shape_type: 4

Vectorizer:
complexity: 2

Estimator:
     alpha: 0.001
      eta0: 0.0001
  l1_ratio: 0.636755794178
learning_rate: optimal
      loss: perceptron
    n_iter: 66
    n_jobs: 8
   penalty: l2
   power_t: 0.370716399179
iteration: (2/5) 1/8 score (roc_auc): 0.393 (0.638 +- 0.122)
iteration: (3/5) 1/8 score (roc_auc): 0.714 (0.890 +- 0.088)
iteration: (4/5) 1/8 score (roc_auc): 0.648 (0.868 +- 0.110)
iteration: (5/5) 1/8 score (roc_auc): 0.697 (0.904 +- 0.104)
iteration: (1/5) 2/8 score (roc_auc): 0.675 (0.848 +- 0.086)
iteration: (2/5) 2/8 score (roc_auc): 0.654 (0.844 +- 0.095)
iteration: (3/5) 2/8 score (roc_auc): 0.402 (0.518 +- 0.058)
iteration: (4/5) 2/8 score (roc_auc): 0.172 (0.352 +- 0.090)
iteration: (5/5) 2/8 score (roc_auc): 0.739 (0.876 +- 0.068)


	Iteration: 2/8 (after 23.5 sec; 0:00:23.514244)
Best score (roc_auc): 0.739 (0.876 +- 0.068)

Data:
Instances: 125 ; Features: 1048577 with an avg of 1221 features per instance
class: 1 count:25 (0.20)	class: -1 count:100 (0.80)	

	Model parameters:

Pre_processor:
energy_range: 20
   max_num: 1
shape_type: 5

Vectorizer:
complexity: 3

Estimator:
     alpha: 1e-08
      eta0: 0.01
  l1_ratio: 0.272014888036
learning_rate: constant
      loss: squared_hinge
    n_iter: 168
    n_jobs: 8
   penalty: l2
   power_t: 0.370716399179
iteration: (1/5) 3/8 score (roc_auc): 0.654 (0.844 +- 0.095)
iteration: (2/5) 3/8 score (roc_auc): 0.766 (0.858 +- 0.046)


	Iteration: 3/8 (after 30.7 sec; 0:00:30.677228)
Best score (roc_auc): 0.766 (0.858 +- 0.046)

Data:
Instances: 125 ; Features: 1048577 with an avg of 1220 features per instance
class: 1 count:25 (0.20)	class: -1 count:100 (0.80)	

	Model parameters:

Pre_processor:
energy_range: 30
   max_num: 1
shape_type: 5

Vectorizer:
complexity: 3

Estimator:
     alpha: 1e-07
      eta0: 0.001
  l1_ratio: 0.272014888036
learning_rate: optimal
      loss: squared_hinge
    n_iter: 137
    n_jobs: 8
   penalty: l2
   power_t: 0.693993232338
iteration: (3/5) 3/8 score (roc_auc): 0.543 (0.802 +- 0.129)
iteration: (4/5) 3/8 score (roc_auc): 0.684 (0.850 +- 0.083)
iteration: (5/5) 3/8 score (roc_auc): 0.675 (0.850 +- 0.087)
iteration: (1/5) 4/8 score (roc_auc): 0.364 (0.506 +- 0.071)
iteration: (2/5) 4/8 score (roc_auc): 0.628 (0.848 +- 0.110)
iteration: (3/5) 4/8 score (roc_auc): -0.079 (0.132 +- 0.106)
iteration: (4/5) 4/8 score (roc_auc): -0.067 (0.138 +- 0.102)
iteration: (5/5) 4/8 score (roc_auc): 0.689 (0.870 +- 0.091)


	Parameters range:

Pre_processor:
energy_range: [30, 20, 30]
   max_num: [3, 1, 1]
shape_type: [4, 5, 5]

Vectorizer:
complexity: [2, 3, 3]

Estimator:
     alpha: [0.001, 1e-08, 1e-07]
      eta0: [0.0001, 0.01, 0.001]
  l1_ratio: [0.63675579417838901, 0.27201488803636775, 0.27201488803636775]
learning_rate: ['optimal', 'constant', 'optimal']
      loss: ['perceptron', 'squared_hinge', 'squared_hinge']
    n_iter: [66, 168, 137]
    n_jobs: [8, 8, 8]
   penalty: ['l2', 'l2', 'l2']
   power_t: [0.37071639917922394, 0.37071639917922394, 0.69399323233787713]
iteration: (1/5) 5/8 score (roc_auc): 0.755 (0.880 +- 0.063)
iteration: (2/5) 5/8 score (roc_auc): 0.707 (0.864 +- 0.079)
iteration: (3/5) 5/8 score (roc_auc): 0.755 (0.880 +- 0.063)
iteration: (4/5) 5/8 score (roc_auc): 0.618 (0.864 +- 0.123)
iteration: (5/5) 5/8 score (roc_auc): 0.707 (0.864 +- 0.079)
iteration: (1/5) 6/8 score (roc_auc): 0.586 (0.828 +- 0.121)
iteration: (2/5) 6/8 score (roc_auc): 0.592 (0.834 +- 0.121)
iteration: (3/5) 6/8 score (roc_auc): 0.771 (0.910 +- 0.070)


	Iteration: 6/8 (after 57.3 sec; 0:00:57.331718)
Best score (roc_auc): 0.771 (0.910 +- 0.070)

Data:
Instances: 125 ; Features: 1048577 with an avg of 541 features per instance
class: 1 count:25 (0.20)	class: -1 count:100 (0.80)	

	Model parameters:

Pre_processor:
energy_range: 30
   max_num: 1
shape_type: 5

Vectorizer:
complexity: 2

Estimator:
     alpha: 0.001
      eta0: 0.001
  l1_ratio: 0.272014888036
learning_rate: constant
      loss: perceptron
    n_iter: 66
    n_jobs: 8
   penalty: l2
   power_t: 0.693993232338
iteration: (4/5) 6/8 score (roc_auc): 0.774 (0.896 +- 0.061)


	Iteration: 6/8 (after 58.9 sec; 0:00:58.895805)
Best score (roc_auc): 0.774 (0.896 +- 0.061)

Data:
Instances: 125 ; Features: 1048577 with an avg of 541 features per instance
class: 1 count:25 (0.20)	class: -1 count:100 (0.80)	

	Model parameters:

Pre_processor:
energy_range: 30
   max_num: 1
shape_type: 5

Vectorizer:
complexity: 2

Estimator:
     alpha: 1e-07
      eta0: 0.0001
  l1_ratio: 0.636755794178
learning_rate: optimal
      loss: squared_hinge
    n_iter: 168
    n_jobs: 8
   penalty: l2
   power_t: 0.370716399179
iteration: (5/5) 6/8 score (roc_auc): 0.547 (0.818 +- 0.135)
iteration: (1/5) 7/8 score (roc_auc): 0.741 (0.894 +- 0.077)
iteration: (2/5) 7/8 score (roc_auc): 0.727 (0.886 +- 0.079)
iteration: (3/5) 7/8 score (roc_auc): 0.642 (0.872 +- 0.115)
iteration: (4/5) 7/8 score (roc_auc): 0.642 (0.872 +- 0.115)
iteration: (5/5) 7/8 score (roc_auc): 0.700 (0.880 +- 0.090)
iteration: (1/5) 8/8 score (roc_auc): 0.547 (0.818 +- 0.135)
iteration: (2/5) 8/8 score (roc_auc): 0.701 (0.888 +- 0.093)
iteration: (3/5) 8/8 score (roc_auc): 0.698 (0.860 +- 0.081)
iteration: (4/5) 8/8 score (roc_auc): 0.701 (0.888 +- 0.093)
iteration: (5/5) 8/8 score (roc_auc): 0.650 (0.872 +- 0.111)
Saved current best model in eden_model_RF00005
CPU times: user 15.9 s, sys: 4.05 s, total: 19.9 s
Wall time: 1min 11s

In [33]:
%%time
#estimate predictive performance
# show the best parameters found by the optimization (Python 2 print statement)
print model.get_parameters()
# held-out evaluation; compare APR/ROC against the default-parameter run above
apr, roc = model.estimate( iterable_pos_test, iterable_neg_test )


	Model parameters:

Pre_processor:
energy_range: 30
   max_num: 1
shape_type: 5

Vectorizer:
complexity: 2

Estimator:
     alpha: 1e-07
      eta0: 0.0001
  l1_ratio: 0.636755794178
learning_rate: optimal
      loss: squared_hinge
    n_iter: 168
    n_jobs: 8
   penalty: l2
   power_t: 0.370716399179

Classifier:
SGDClassifier(alpha=1e-07, average=True, class_weight='auto', epsilon=0.1,
       eta0=0.0001, fit_intercept=True, l1_ratio=0.63675579417838901,
       learning_rate='optimal', loss='squared_hinge', n_iter=168, n_jobs=8,
       penalty='l2', power_t=0.37071639917922394, random_state=None,
       shuffle=True, verbose=0, warm_start=False)

Data:
Instances: 125 ; Features: 1048577 with an avg of 539 features per instance

Predictive performace estimate:
             precision    recall  f1-score   support

         -1       0.91      0.96      0.93       100
          1       0.79      0.60      0.68        25

avg / total       0.88      0.89      0.88       125

APR: 0.846
ROC: 0.931
CPU times: user 988 ms, sys: 281 ms, total: 1.27 s
Wall time: 2.91 s

Models can be reloaded from disk


In [34]:
from eden.model import ActiveLearningBinaryClassificationModel

# build an empty model object and restore the one saved by optimize()
model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)

from eden.converter.fasta import fasta_to_sequence
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
from itertools import tee
seqs,seqs_=tee(seqs)
iterable_pos = seqs

#consider only first 'size' elements
from itertools import islice
iterable_pos = islice(iterable_pos,size)

# decision-function margins for the positives, printed in ascending order
# (Python 2 print statement)
predictions= model2.decision_function( iterable_pos )
for n,i in enumerate(sorted(predictions)): print n,i


0 -1.97101822093e+13
1 -7.35764193893e+12
2 -7.09754592034e+12
3 -4.7882014607e+12
4 -3.37429885621e+12
5 -3.16913100515e+12
6 -1.81069792889e+12
7 -1.55226667126e+12
8 -1.25202997698e+12
9 -797510914496.0
10 2.40676296741e+12
11 2.69653383959e+12
12 3.62637825702e+12
13 5.79260195555e+12
14 6.00823840975e+12
15 6.09694460939e+12
16 9.26065405415e+12
17 1.04129183199e+13
18 1.1657460466e+13
19 1.1934315022e+13
20 1.21041875587e+13
21 1.21574725905e+13
22 1.24561293146e+13
23 1.42181635901e+13
24 1.53622586791e+13
25 1.57535802871e+13
26 1.59110990495e+13
27 1.62549991833e+13
28 2.11702970663e+13
29 2.13733146274e+13
30 2.54813724991e+13
31 2.68358031418e+13
32 3.06106497162e+13
33 3.49245750265e+13
34 3.73941421185e+13
35 3.7468913099e+13
36 3.79534065309e+13
37 3.80124335864e+13
38 3.81243298545e+13
39 3.88708629459e+13
40 4.11960575865e+13
41 4.1222841921e+13
42 4.31545695103e+13
43 4.32748660092e+13
44 4.68674479294e+13
45 4.77337480834e+13
46 4.82046871131e+13
47 4.91709038027e+13
48 5.33770665814e+13
49 5.35047714561e+13

ActiveLearningBinaryClassificationModel


In [35]:
#create iterable from files
# (fresh iterators again: the previous cells consumed the earlier ones)
from eden.converter.fasta import fasta_to_sequence
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
from itertools import tee
seqs,seqs_=tee(seqs)
iterable_pos = seqs
from eden.modifier.seq import seq_to_seq, shuffle_modifier
iterable_neg = seq_to_seq( seqs_, modifier=shuffle_modifier, times=times, order=2 )

#consider only first 'size' elements
from itertools import islice
iterable_pos = islice(iterable_pos,size)
iterable_neg = islice(iterable_neg,size*times)

#split train/test
from eden.util import random_bipartition_iter
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)

In [36]:
%%time
#make predictive model
from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel(pre_processor=pre_processor, 
                                                estimator=estimator, 
                                                vectorizer=vectorizer,
                                                n_jobs=n_jobs,
                                                pre_processor_n_jobs=n_jobs,
                                                n_blocks=8)

#optimize hyperparameters and fit model
from numpy.random import randint
from numpy.random import uniform
# narrower pre-processor search space than the earlier runs — presumably
# restricted to the best-scoring region found above; confirm intent
pre_processor_parameters={'max_num':[1,3], 
                          'shape_type':[5], 
                          'energy_range':[10,30]}

vectorizer_parameters={'complexity':[2,3]}

estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      'power_t':uniform(0.1, size=n_iter),
                      'alpha': [10**x for x in range(-8,0)],
                      'eta0': [10**x for x in range(-4,-1)],
                      'learning_rate': ["invscaling", "constant", "optimal"],
                      'n_jobs':[n_jobs]}
# cap on negatives per active-learning round; positives uncapped (size_positive=-1)
active_set_size = size * 2
# NOTE: model_fname is rebound here — the reload cell below loads THIS
# active-learning model, not the earlier 'eden_model_<id>' one
model_fname='eden_model_active_%s'%rfam_id
model.optimize(iterable_pos_train, iterable_neg_train, 
               model_name=model_fname,
               score_func=lambda avg_score,std_score : avg_score - std_score * 2,
               scoring='roc_auc',
               n_active_learning_iterations=4,
               n_iter=n_iter, 
               size_positive=-1,
               size_negative=active_set_size,
               cv=5,
               pre_processor_parameters=pre_processor_parameters, 
               vectorizer_parameters=vectorizer_parameters, 
               estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
energy_range: [10, 30]
   max_num: [1, 3]
shape_type: [5]

Vectorizer:
complexity: [2, 3]

Estimator:
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.41498197  0.66027452  0.4106494   0.74756393  0.78836933  0.41586935
  0.19717904  0.1883821 ]
learning_rate: ['invscaling', 'constant', 'optimal']
      loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
    n_iter: [38 39 18 89 16 92 41 37]
    n_jobs: [8]
   penalty: ['l1', 'l2', 'elasticnet']
   power_t: [ 0.4883884   0.3461041   0.78654708  0.51250154  0.34585254  0.54596373
  0.56011454  0.23687811]
iteration: (1/5) 1/8 score (roc_auc): 0.380 (0.800 +- 0.210)


	Iteration: 1/8 (after 36.1 sec; 0:00:36.143424)
Best score (roc_auc): 0.380 (0.800 +- 0.210)

Data:
Instances: 35 ; Features: 1048577 with an avg of 892 features per instance
class: 1 count:25 (0.71)	class: -1 count:10 (0.29)	

	Model parameters:

Pre_processor:
energy_range: 30
   max_num: 3
shape_type: 5

Vectorizer:
complexity: 2

Estimator:
     alpha: 1e-05
      eta0: 0.01
  l1_ratio: 0.788369330008
learning_rate: constant
      loss: hinge
    n_iter: 16
    n_jobs: 8
   penalty: l2
   power_t: 0.488388396988
iteration: (2/5) 1/8 score (roc_auc): 0.380 (0.800 +- 0.210)
iteration: (3/5) 1/8 score (roc_auc): 0.237 (0.780 +- 0.271)
iteration: (4/5) 1/8 score (roc_auc): -0.243 (0.220 +- 0.232)
iteration: (5/5) 1/8 score (roc_auc): 0.380 (0.800 +- 0.210)
iteration: (1/5) 2/8 score (roc_auc): 0.559 (0.814 +- 0.127)


	Iteration: 2/8 (after 90.3 sec; 0:01:30.296692)
Best score (roc_auc): 0.559 (0.814 +- 0.127)

Data:
Instances: 125 ; Features: 1048577 with an avg of 1221 features per instance
class: 1 count:25 (0.20)	class: -1 count:100 (0.80)	

	Model parameters:

Pre_processor:
energy_range: 10
   max_num: 1
shape_type: 5

Vectorizer:
complexity: 3

Estimator:
     alpha: 0.001
      eta0: 0.01
  l1_ratio: 0.197179038734
learning_rate: constant
      loss: log
    n_iter: 92
    n_jobs: 8
   penalty: elasticnet
   power_t: 0.512501539392
iteration: (2/5) 2/8 score (roc_auc): 0.666 (0.850 +- 0.092)


	Iteration: 2/8 (after 91.1 sec; 0:01:31.129138)
Best score (roc_auc): 0.666 (0.850 +- 0.092)

Data:
Instances: 125 ; Features: 1048577 with an avg of 1221 features per instance
class: 1 count:25 (0.20)	class: -1 count:100 (0.80)	

	Model parameters:

Pre_processor:
energy_range: 10
   max_num: 1
shape_type: 5

Vectorizer:
complexity: 3

Estimator:
     alpha: 1e-07
      eta0: 0.01
  l1_ratio: 0.747563932034
learning_rate: invscaling
      loss: squared_hinge
    n_iter: 18
    n_jobs: 8
   penalty: l1
   power_t: 0.236878112397
iteration: (3/5) 2/8 score (roc_auc): 0.314 (0.450 +- 0.068)
iteration: (4/5) 2/8 score (roc_auc): 0.719 (0.884 +- 0.082)


	Iteration: 2/8 (after 93.2 sec; 0:01:33.221674)
Best score (roc_auc): 0.719 (0.884 +- 0.082)

Data:
Instances: 125 ; Features: 1048577 with an avg of 1221 features per instance
class: 1 count:25 (0.20)	class: -1 count:100 (0.80)	

	Model parameters:

Pre_processor:
energy_range: 10
   max_num: 1
shape_type: 5

Vectorizer:
complexity: 3

Estimator:
     alpha: 1e-06
      eta0: 0.0001
  l1_ratio: 0.415869346955
learning_rate: optimal
      loss: log
    n_iter: 39
    n_jobs: 8
   penalty: elasticnet
   power_t: 0.34585253909
iteration: (5/5) 2/8 score (roc_auc): 0.665 (0.846 +- 0.091)
iteration: (1/5) 3/8 score (roc_auc): -0.071 (0.120 +- 0.096)
iteration: (2/5) 3/8 score (roc_auc): 0.736 (0.930 +- 0.097)


	Iteration: 3/8 (after 155.3 sec; 0:02:35.253464)
Best score (roc_auc): 0.736 (0.930 +- 0.097)

Data:
Instances: 125 ; Features: 1048577 with an avg of 1876 features per instance
class: 1 count:25 (0.20)	class: -1 count:100 (0.80)	

	Model parameters:

Pre_processor:
energy_range: 10
   max_num: 3
shape_type: 5

Vectorizer:
complexity: 3

Estimator:
     alpha: 1e-07
      eta0: 0.001
  l1_ratio: 0.415869346955
learning_rate: optimal
      loss: squared_hinge
    n_iter: 41
    n_jobs: 8
   penalty: elasticnet
   power_t: 0.34585253909
iteration: (3/5) 3/8 score (roc_auc): 0.648 (0.868 +- 0.110)
iteration: (4/5) 3/8 score (roc_auc): -0.083 (0.120 +- 0.101)
iteration: (5/5) 3/8 score (roc_auc): 0.677 (0.880 +- 0.101)
iteration: (1/5) 4/8 score (roc_auc): 0.697 (0.892 +- 0.097)
iteration: (2/5) 4/8 score (roc_auc): 0.706 (0.894 +- 0.094)
iteration: (3/5) 4/8 score (roc_auc): 0.738 (0.900 +- 0.081)


	Iteration: 4/8 (after 198.4 sec; 0:03:18.395552)
Best score (roc_auc): 0.738 (0.900 +- 0.081)

Data:
Instances: 125 ; Features: 1048577 with an avg of 784 features per instance
class: 1 count:25 (0.20)	class: -1 count:100 (0.80)	

	Model parameters:

Pre_processor:
energy_range: 10
   max_num: 3
shape_type: 5

Vectorizer:
complexity: 2

Estimator:
     alpha: 1e-08
      eta0: 0.0001
  l1_ratio: 0.41064939684
learning_rate: optimal
      loss: log
    n_iter: 38
    n_jobs: 8
   penalty: elasticnet
   power_t: 0.488388396988
iteration: (4/5) 4/8 score (roc_auc): -0.092 (0.110 +- 0.101)
iteration: (5/5) 4/8 score (roc_auc): -0.087 (0.108 +- 0.097)


	Parameters range:

Pre_processor:
energy_range: [30, 10, 10, 10, 10, 10]
   max_num: [3, 1, 1, 1, 3, 3]
shape_type: [5, 5, 5, 5, 5, 5]

Vectorizer:
complexity: [2, 3, 3, 3, 3, 2]

Estimator:
     alpha: [1e-05, 0.001, 1e-07, 1e-06, 1e-07, 1e-08]
      eta0: [0.01, 0.01, 0.01, 0.0001, 0.001, 0.0001]
  l1_ratio: [0.78836933000807574, 0.19717903873376202, 0.74756393203358895, 0.41586934695469335, 0.41586934695469335, 0.41064939683992552]
learning_rate: ['constant', 'constant', 'invscaling', 'optimal', 'optimal', 'optimal']
      loss: ['hinge', 'log', 'squared_hinge', 'log', 'squared_hinge', 'log']
    n_iter: [16, 92, 18, 39, 41, 38]
    n_jobs: [8, 8, 8, 8, 8, 8]
   penalty: ['l2', 'elasticnet', 'l1', 'elasticnet', 'elasticnet', 'elasticnet']
   power_t: [0.48838839698808101, 0.51250153939168219, 0.23687811239710896, 0.34585253909009317, 0.34585253909009317, 0.48838839698808101]
iteration: (1/5) 5/8 score (roc_auc): 0.699 (0.888 +- 0.095)
iteration: (2/5) 5/8 score (roc_auc): 0.736 (0.880 +- 0.072)
iteration: (3/5) 5/8 score (roc_auc): 0.673 (0.848 +- 0.087)
iteration: (4/5) 5/8 score (roc_auc): 0.704 (0.888 +- 0.092)
iteration: (5/5) 5/8 score (roc_auc): 0.665 (0.846 +- 0.090)
iteration: (1/5) 6/8 score (roc_auc): 0.653 (0.904 +- 0.125)
iteration: (2/5) 6/8 score (roc_auc): 0.681 (0.856 +- 0.087)
iteration: (3/5) 6/8 score (roc_auc): 0.774 (0.896 +- 0.061)


	Iteration: 6/8 (after 289.8 sec; 0:04:49.750566)
Best score (roc_auc): 0.774 (0.896 +- 0.061)

Data:
Instances: 125 ; Features: 1048577 with an avg of 541 features per instance
class: 1 count:25 (0.20)	class: -1 count:100 (0.80)	

	Model parameters:

Pre_processor:
energy_range: 30
   max_num: 1
shape_type: 5

Vectorizer:
complexity: 2

Estimator:
     alpha: 1e-05
      eta0: 0.01
  l1_ratio: 0.747563932034
learning_rate: optimal
      loss: hinge
    n_iter: 92
    n_jobs: 8
   penalty: elasticnet
   power_t: 0.488388396988
iteration: (4/5) 6/8 score (roc_auc): 0.773 (0.904 +- 0.065)
iteration: (5/5) 6/8 score (roc_auc): 0.717 (0.874 +- 0.079)
iteration: (1/5) 7/8 score (roc_auc): 0.710 (0.870 +- 0.080)
iteration: (2/5) 7/8 score (roc_auc): 0.778 (0.892 +- 0.057)


	Iteration: 7/8 (after 324.8 sec; 0:05:24.847134)
Best score (roc_auc): 0.778 (0.892 +- 0.057)

Data:
Instances: 125 ; Features: 1048577 with an avg of 541 features per instance
class: 1 count:25 (0.20)	class: -1 count:100 (0.80)	

	Model parameters:

Pre_processor:
energy_range: 10
   max_num: 1
shape_type: 5

Vectorizer:
complexity: 2

Estimator:
     alpha: 1e-06
      eta0: 0.001
  l1_ratio: 0.788369330008
learning_rate: optimal
      loss: log
    n_iter: 41
    n_jobs: 8
   penalty: elasticnet
   power_t: 0.488388396988
iteration: (3/5) 7/8 score (roc_auc): 0.764 (0.906 +- 0.071)
iteration: (4/5) 7/8 score (roc_auc): 0.661 (0.880 +- 0.109)
iteration: (5/5) 7/8 score (roc_auc): 0.729 (0.890 +- 0.081)
iteration: (1/5) 8/8 score (roc_auc): 0.700 (0.858 +- 0.079)
iteration: (2/5) 8/8 score (roc_auc): 0.545 (0.802 +- 0.129)
iteration: (3/5) 8/8 score (roc_auc): 0.681 (0.856 +- 0.087)
iteration: (4/5) 8/8 score (roc_auc): 0.722 (0.892 +- 0.085)
iteration: (5/5) 8/8 score (roc_auc): 0.687 (0.856 +- 0.085)
Saved current best model in eden_model_active_RF00005
CPU times: user 3min 59s, sys: 24.3 s, total: 4min 23s
Wall time: 6min 2s

In [37]:
%%time
#estimate predictive performance
# held-out evaluation of the active-learning model (APR/ROC in output below)
apr, roc = model.estimate( iterable_pos_test, iterable_neg_test )


Classifier:
SGDClassifier(alpha=1e-06, average=True, class_weight='auto', epsilon=0.1,
       eta0=0.001, fit_intercept=True, l1_ratio=0.78836933000807574,
       learning_rate='optimal', loss='log', n_iter=41, n_jobs=8,
       penalty='elasticnet', power_t=0.48838839698808101,
       random_state=None, shuffle=True, verbose=0, warm_start=False)

Data:
Instances: 125 ; Features: 1048577 with an avg of 538 features per instance

Predictive performace estimate:
             precision    recall  f1-score   support

         -1       0.90      0.95      0.92       100
          1       0.74      0.56      0.64        25

avg / total       0.86      0.87      0.87       125

APR: 0.805
ROC: 0.919
CPU times: user 1.13 s, sys: 393 ms, total: 1.52 s
Wall time: 2.58 s

In [38]:
from eden.model import ActiveLearningBinaryClassificationModel

# model_fname was rebound above, so this loads the active-learning model
# ('eden_model_active_<id>'), not the first model saved in this notebook
model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)

from eden.converter.fasta import fasta_to_sequence
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
from itertools import tee
seqs,seqs_=tee(seqs)
iterable_pos = seqs

#consider only first 'size' elements
from itertools import islice
iterable_pos = islice(iterable_pos,size)

# decision-function margins for the positives, printed in ascending order
# (Python 2 print statement)
predictions= model2.decision_function( iterable_pos )
for n,i in enumerate(sorted(predictions)): print n,i


0 -9.15441193458
1 -3.26274204084
2 -2.92329359638
3 -2.53663040528
4 -2.50864632443
5 -1.51944960973
6 -1.45082184291
7 -0.912452486938
8 -0.893883898788
9 -0.844319188942
10 -0.584092410651
11 1.29428172831
12 1.55886611575
13 2.65707754247
14 3.2153114505
15 3.30005985442
16 3.66643922389
17 4.37081449884
18 4.81857917744
19 4.91645421641
20 5.49344923367
21 5.54164847005
22 6.06949893411
23 6.25015214694
24 6.83042294098
25 7.5396420058
26 7.64529287534
27 8.29028070133
28 8.4935455855
29 8.66314069637
30 9.32892574504
31 11.6503877828
32 14.3215794701
33 14.5877776096
34 15.5673705644
35 16.0083179259
36 17.3348909386
37 19.4410047235
38 19.9094633179
39 20.1682547975
40 20.4469795704
41 21.4644840854
42 21.6500559136
43 21.9604598916
44 22.1846592975
45 22.5251133201
46 23.7537913933
47 25.187098142
48 25.6844746392
49 29.5405464382