In [1]:
from eden.util import configure_logging
import logging
configure_logging(logging.getLogger(),verbosity=2)

In [2]:
def rfam_uri(family_id):
    #local FASTA file variant
    return '%s.fa'%(family_id)
def rfam_uri(family_id):
    #Rfam download variant; being defined last, this is the version actually used
    return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0'%(family_id,family_id)
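For example, with the tRNA family selected in the next cell, the download variant resolves to the URL that later appears in the request log:

rfam_uri('RF00005')
# 'http://rfam.xfam.org/family/RF00005/alignment?acc=RF00005&format=fastau&download=0'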

In [3]:
#candidate Rfam families; the last assignment is the one actually used
rfam_id = 'RF02275' #Hammerhead_HH9
rfam_id = 'RF00871' #microRNA mir-689
rfam_id = 'RF00005' #tRNA

In [4]:
def pre_processor( data, **args):
    #fold each sequence with RNAfold and convert the predicted structure to a graph
    from eden.converter.rna.rnafold import rnafold_to_eden
    graphs = rnafold_to_eden( data, **args )
    return graphs

In [5]:
def pre_processor( data, **args):
    #fold each sequence with RNAshapes; being defined last, this is the pre_processor actually used
    from eden.converter.rna.rnashapes import rnashapes_to_eden
    graphs = rnashapes_to_eden( data, **args )
    return graphs

In [6]:
from eden.graph import Vectorizer
vectorizer = Vectorizer()

In [7]:
from sklearn.linear_model import SGDClassifier, Perceptron, PassiveAggressiveClassifier
#candidate linear estimators; the last assignment (SGDClassifier) is the one actually used
estimator = PassiveAggressiveClassifier(shuffle=True)
estimator = Perceptron(class_weight='auto', shuffle=True)
estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True)

In [8]:
#data setup
model_fname='eden_model_%s'%rfam_id
size=50               #number of positive sequences to use
train_test_split=0.5  #fraction of instances assigned to the training split
n_iter=8              #hyperparameter optimization iterations (also the number of sampled values per estimator parameter)
times=4               #shuffled negatives generated per positive sequence
n_jobs=8              #parallel jobs

BinaryClassificationModel with Default Parameters


In [9]:
#create iterable from files
from eden.converter.fasta import fasta_to_sequence
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
from itertools import tee
seqs,seqs_=tee(seqs)
iterable_pos = seqs
from eden.modifier.seq import seq_to_seq, shuffle_modifier
iterable_neg = seq_to_seq( seqs_, modifier=shuffle_modifier, times=times, order=2 )

#consider only the first 'size' elements
from itertools import islice
iterable_pos = islice(iterable_pos,size)
iterable_neg = islice(iterable_neg,size*times)

#split train/test
from eden.util import random_bipartition_iter
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)


Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF00005/alignment?acc=RF00005&format=fastau&download=0 HTTP/1.1" 200 90476

In [10]:
%%time
#make predictive model
from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel(pre_processor=pre_processor, 
                                                estimator=estimator, 
                                                vectorizer=vectorizer,
                                                n_jobs=n_jobs,
                                                pre_processor_n_jobs=n_jobs,
                                                n_blocks=5)

#optimize hyperparameters and fit model
from numpy.random import randint
from numpy.random import uniform
pre_processor_parameters={'max_num':[3,1,2,3], 
                          'shape_type':[4,5], 
                          'energy_range':[30, 5,10,20,30,40]}

vectorizer_parameters={'complexity':[2,3]}

estimator_parameters={'n_iter':randint(5, 200, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      'power_t':uniform(0.1, size=n_iter),
                      'alpha': [10**x for x in range(-8,0)],
                      'eta0': [10**x for x in range(-4,-1)],
                      'learning_rate': ["invscaling", "constant", "optimal"],
                      'n_jobs':[n_jobs]}

model.optimize(iterable_pos_train, iterable_neg_train, 
               model_name=model_fname,
               n_iter=1, #a single iteration makes the optimizer fall back to the default parameters
               pre_processor_parameters=pre_processor_parameters, 
               vectorizer_parameters=vectorizer_parameters, 
               estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
energy_range: [30, 5, 10, 20, 30, 40]
   max_num: [3, 1, 2, 3]
shape_type: [4, 5]

Vectorizer:
complexity: [2, 3]

Estimator:
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.48852528  0.88377663  0.63612889  0.50549644  0.69755188  0.29498161
  0.58748642  0.52251176]
learning_rate: ['invscaling', 'constant', 'optimal']
      loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
    n_iter: [114 161  56 128  42 134 183 129]
    n_jobs: [8]
   penalty: ['l1', 'l2', 'elasticnet']
   power_t: [ 0.78188655  0.34545935  0.59407691  0.97009502  0.96806257  0.18117636
  0.14489189  0.77858605]
n_iter is 1: switching to default parameters
Saved current best model in eden_model_RF00005
CPU times: user 4.01 s, sys: 503 ms, total: 4.51 s
Wall time: 6.82 s

In [11]:
%%time
#estimate predictive performance
print model.get_parameters()
apr, roc = model.estimate( iterable_pos_test, iterable_neg_test )


	Model parameters:

Pre_processor:
energy_range: 30
   max_num: 3
shape_type: 4

Vectorizer:
complexity: 2

Estimator:
     alpha: 1e-08
      eta0: 0.0001
  l1_ratio: 0.488525281467
learning_rate: invscaling
      loss: hinge
    n_iter: 114
    n_jobs: 8
   penalty: l1
   power_t: 0.781886548686

Classifier:
SGDClassifier(alpha=1e-08, average=True, class_weight='auto', epsilon=0.1,
       eta0=0.0001, fit_intercept=True, l1_ratio=0.48852528146731278,
       learning_rate='invscaling', loss='hinge', n_iter=114, n_jobs=8,
       penalty='l1', power_t=0.78188654868601615, random_state=None,
       shuffle=True, verbose=0, warm_start=False)

Data:
Instances: 125 ; Features: 1048577 with an avg of 824 features per instance

Predictive performance estimate:
             precision    recall  f1-score   support

         -1       0.87      0.97      0.92       100
          1       0.77      0.40      0.53        25

avg / total       0.85      0.86      0.84       125

APR: 0.629
ROC: 0.791
CPU times: user 3.01 s, sys: 578 ms, total: 3.59 s
Wall time: 5.73 s

BinaryClassificationModel with optimisation


In [12]:
#create iterable from files
from eden.converter.fasta import fasta_to_sequence
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
from itertools import tee
seqs,seqs_=tee(seqs)
iterable_pos = seqs
from eden.modifier.seq import seq_to_seq, shuffle_modifier
iterable_neg = seq_to_seq( seqs_, modifier=shuffle_modifier, times=times, order=2 )

#consider only the first 'size' elements
from itertools import islice
iterable_pos = islice(iterable_pos,size)
iterable_neg = islice(iterable_neg,size*times)

#split train/test
from eden.util import random_bipartition_iter
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)


Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF00005/alignment?acc=RF00005&format=fastau&download=0 HTTP/1.1" 200 90476

In [13]:
%%time
#make predictive model
from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel(pre_processor=pre_processor, 
                                                estimator=estimator, 
                                                vectorizer=vectorizer,
                                                n_jobs=n_jobs,
                                                pre_processor_n_jobs=n_jobs,
                                                n_blocks=5)

#optimize hyperparameters and fit model
from numpy.random import randint
from numpy.random import uniform
pre_processor_parameters={'max_num':[3,1,2,3], 
                          'shape_type':[4,5], 
                          'energy_range':[30, 5,10,20,30,40]}

vectorizer_parameters={'complexity':[2,3]}

estimator_parameters={'n_iter':randint(5, 200, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      'power_t':uniform(0.1, size=n_iter),
                      'alpha': [10**x for x in range(-8,0)],
                      'eta0': [10**x for x in range(-4,-1)],
                      'learning_rate': ["invscaling", "constant", "optimal"],
                      'n_jobs':[n_jobs]}

model.optimize(iterable_pos_train, iterable_neg_train, 
               model_name=model_fname,
               max_total_time=-1, 
               n_iter=n_iter,
               n_inner_iter_estimator=5,
               cv=5,
               score_func=lambda avg_score,std_score : avg_score - std_score * 2,
               scoring='roc_auc',
               two_steps_optimization=True,
               pre_processor_parameters=pre_processor_parameters, 
               vectorizer_parameters=vectorizer_parameters, 
               estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
energy_range: [30, 5, 10, 20, 30, 40]
   max_num: [3, 1, 2, 3]
shape_type: [4, 5]

Vectorizer:
complexity: [2, 3]

Estimator:
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.35601244  0.19994856  0.23091638  0.16580341  0.22365208  0.63863344
  0.21982035  0.67858568]
learning_rate: ['invscaling', 'constant', 'optimal']
      loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
    n_iter: [ 15 154  22  66  27 152 158 112]
    n_jobs: [8]
   penalty: ['l1', 'l2', 'elasticnet']
   power_t: [ 0.37531144  0.25617309  0.81085078  0.23892281  0.81009571  0.81703234
  0.20804395  0.17481839]
iteration: (1/5) 1/8 score (roc_auc): 0.013 (0.294 +- 0.141)


	Iteration: 1/8 (after 8.9 sec; 0:00:08.909029)
Best score (roc_auc): 0.013 (0.294 +- 0.141)

Data:
Instances: 125 ; Features: 1048577 with an avg of 846 features per instance
class: 1 count:25 (0.20)	class: -1 count:100 (0.80)	

	Model parameters:

Pre_processor:
energy_range: 30
   max_num: 3
shape_type: 4

Vectorizer:
complexity: 2

Estimator:
     alpha: 0.01
      eta0: 0.001
  l1_ratio: 0.21982035442
learning_rate: invscaling
      loss: squared_hinge
    n_iter: 158
    n_jobs: 8
   penalty: l1
   power_t: 0.810850780143
iteration: (2/5) 1/8 score (roc_auc): 0.846 (0.940 +- 0.047)


	Iteration: 1/8 (after 11.9 sec; 0:00:11.943513)
Best score (roc_auc): 0.846 (0.940 +- 0.047)

Data:
Instances: 125 ; Features: 1048577 with an avg of 846 features per instance
class: 1 count:25 (0.20)	class: -1 count:100 (0.80)	

	Model parameters:

Pre_processor:
energy_range: 30
   max_num: 3
shape_type: 4

Vectorizer:
complexity: 2

Estimator:
     alpha: 1e-05
      eta0: 0.0001
  l1_ratio: 0.199948555451
learning_rate: optimal
      loss: squared_hinge
    n_iter: 158
    n_jobs: 8
   penalty: l1
   power_t: 0.208043949458
iteration: (3/5) 1/8 score (roc_auc): 0.609 (0.860 +- 0.126)
iteration: (4/5) 1/8 score (roc_auc): 0.703 (0.892 +- 0.095)
iteration: (5/5) 1/8 score (roc_auc): 0.499 (0.560 +- 0.030)
iteration: (1/5) 2/8 score (roc_auc): 0.680 (0.886 +- 0.103)
iteration: (2/5) 2/8 score (roc_auc): 0.698 (0.890 +- 0.096)
iteration: (3/5) 2/8 score (roc_auc): 0.668 (0.878 +- 0.105)
iteration: (4/5) 2/8 score (roc_auc): 0.772 (0.912 +- 0.070)
iteration: (5/5) 2/8 score (roc_auc): 0.768 (0.908 +- 0.070)
iteration: (1/5) 3/8 score (roc_auc): 0.690 (0.892 +- 0.101)
iteration: (2/5) 3/8 score (roc_auc): 0.670 (0.886 +- 0.108)
iteration: (3/5) 3/8 score (roc_auc): -0.067 (0.130 +- 0.098)
iteration: (4/5) 3/8 score (roc_auc): 0.668 (0.884 +- 0.108)
iteration: (5/5) 3/8 score (roc_auc): 0.095 (0.284 +- 0.094)
iteration: (1/5) 4/8 score (roc_auc): -0.084 (0.086 +- 0.085)
iteration: (2/5) 4/8 score (roc_auc): 0.737 (0.898 +- 0.080)
iteration: (3/5) 4/8 score (roc_auc): 0.807 (0.922 +- 0.057)
iteration: (4/5) 4/8 score (roc_auc): -0.102 (0.088 +- 0.095)
iteration: (5/5) 4/8 score (roc_auc): 0.762 (0.910 +- 0.074)


	Parameters range:

Pre_processor:
energy_range: [30, 30]
   max_num: [3, 3]
shape_type: [4, 4]

Vectorizer:
complexity: [2, 2]

Estimator:
     alpha: [0.01, 1e-05]
      eta0: [0.001, 0.0001]
  l1_ratio: [0.21982035442030839, 0.19994855545106952]
learning_rate: ['invscaling', 'optimal']
      loss: ['squared_hinge', 'squared_hinge']
    n_iter: [158, 158]
    n_jobs: [8, 8]
   penalty: ['l1', 'l1']
   power_t: [0.81085078014295564, 0.20804394945799151]
iteration: (1/5) 5/8 score (roc_auc): -0.084 (0.086 +- 0.085)
iteration: (2/5) 5/8 score (roc_auc): -0.084 (0.086 +- 0.085)
iteration: (3/5) 5/8 score (roc_auc): 0.704 (0.904 +- 0.100)
iteration: (4/5) 5/8 score (roc_auc): 0.730 (0.898 +- 0.084)
iteration: (5/5) 5/8 score (roc_auc): 0.740 (0.900 +- 0.080)
iteration: (1/5) 6/8 score (roc_auc): -0.084 (0.086 +- 0.085)
iteration: (2/5) 6/8 score (roc_auc): -0.064 (0.116 +- 0.090)
iteration: (3/5) 6/8 score (roc_auc): 0.636 (0.876 +- 0.120)
iteration: (4/5) 6/8 score (roc_auc): -0.064 (0.116 +- 0.090)
iteration: (5/5) 6/8 score (roc_auc): 0.740 (0.900 +- 0.080)
iteration: (1/5) 7/8 score (roc_auc): -0.084 (0.086 +- 0.085)
iteration: (2/5) 7/8 score (roc_auc): -0.084 (0.086 +- 0.085)
iteration: (3/5) 7/8 score (roc_auc): -0.084 (0.086 +- 0.085)
iteration: (4/5) 7/8 score (roc_auc): 0.636 (0.876 +- 0.120)
iteration: (5/5) 7/8 score (roc_auc): 0.636 (0.876 +- 0.120)
iteration: (1/5) 8/8 score (roc_auc): 0.704 (0.904 +- 0.100)
iteration: (2/5) 8/8 score (roc_auc): 0.636 (0.876 +- 0.120)
iteration: (3/5) 8/8 score (roc_auc): -0.084 (0.086 +- 0.085)
iteration: (4/5) 8/8 score (roc_auc): -0.129 (0.160 +- 0.144)
iteration: (5/5) 8/8 score (roc_auc): -0.129 (0.160 +- 0.144)
Saved current best model in eden_model_RF00005
CPU times: user 27.7 s, sys: 5.55 s, total: 33.2 s
Wall time: 1min 48s

In [14]:
%%time
#estimate predictive performance
print model.get_parameters()
apr, roc = model.estimate( iterable_pos_test, iterable_neg_test )


	Model parameters:

Pre_processor:
energy_range: 30
   max_num: 3
shape_type: 4

Vectorizer:
complexity: 2

Estimator:
     alpha: 1e-05
      eta0: 0.0001
  l1_ratio: 0.199948555451
learning_rate: optimal
      loss: squared_hinge
    n_iter: 158
    n_jobs: 8
   penalty: l1
   power_t: 0.208043949458

Classifier:
SGDClassifier(alpha=1e-05, average=True, class_weight='auto', epsilon=0.1,
       eta0=0.0001, fit_intercept=True, l1_ratio=0.19994855545106952,
       learning_rate='optimal', loss='squared_hinge', n_iter=158, n_jobs=8,
       penalty='l1', power_t=0.20804394945799151, random_state=None,
       shuffle=True, verbose=0, warm_start=False)

Data:
Instances: 125 ; Features: 1048577 with an avg of 824 features per instance

Predictive performance estimate:
             precision    recall  f1-score   support

         -1       0.90      0.98      0.94       100
          1       0.88      0.56      0.68        25

avg / total       0.89      0.90      0.89       125

APR: 0.779
ROC: 0.894
CPU times: user 3.21 s, sys: 467 ms, total: 3.68 s
Wall time: 6.17 s

Models can be reloaded from disk


In [15]:
from eden.model import ActiveLearningBinaryClassificationModel

model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)

from eden.converter.fasta import fasta_to_sequence
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
from itertools import tee
seqs,seqs_=tee(seqs)
iterable_pos = seqs

#consider only the first 'size' elements
from itertools import islice
iterable_pos = islice(iterable_pos,size)

predictions= model2.decision_function( iterable_pos )
for n,i in enumerate(sorted(predictions)): print n,i


Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF00005/alignment?acc=RF00005&format=fastau&download=0 HTTP/1.1" 200 90476
0 -5.04564984237e+12
1 -4.2921612619e+12
2 -3.61961490129e+12
3 -3.16373317353e+12
4 -2.63147792156e+12
5 -2.4050004638e+12
6 -2.3843591992e+12
7 -1.98219369157e+12
8 -859048813181.0
9 -713618763276.0
10 -372807979695.0
11 344830675597.0
12 389672318171.0
13 397716279779.0
14 614670709485.0
15 1.18030254367e+12
16 1.33368833912e+12
17 1.61529373908e+12
18 2.11426155913e+12
19 2.12774792842e+12
20 2.39735671078e+12
21 2.51889448589e+12
22 2.73944461931e+12
23 3.44622081804e+12
24 3.67108083035e+12
25 3.76452517843e+12
26 3.89242741108e+12
27 4.1988265173e+12
28 4.65296905379e+12
29 4.8842315226e+12
30 5.12787215317e+12
31 5.38470784417e+12
32 5.4644538409e+12
33 6.86805004123e+12
34 8.19933128641e+12
35 1.10745610793e+13
36 1.13267041435e+13
37 1.1338241008e+13
38 1.18026842203e+13
39 1.1946936046e+13
40 1.21205439821e+13
41 1.27601975641e+13
42 1.28327947574e+13
43 1.3088539367e+13
44 1.32355232284e+13
45 1.38884849545e+13
46 1.45890740551e+13
47 1.56604021071e+13
48 1.60545358373e+13
49 1.71810439429e+13
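The sorted margins above are not tied back to the sequences they were computed from. A small variant of the same cell (a sketch, assuming as in the code above that fasta_to_sequence yields (header, sequence) pairs) keeps each FASTA header next to its score:

from itertools import tee, islice
from eden.converter.fasta import fasta_to_sequence

seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
seqs, seqs_ = tee(seqs)

#collect the headers from one copy of the iterator, score the other copy
headers = [header for header, seq in islice(seqs_, size)]
predictions = model2.decision_function( islice(seqs, size) )

#print sequences from lowest to highest margin
for header, score in sorted(zip(headers, predictions), key=lambda x: x[1]):
    print header, score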

ActiveLearningBinaryClassificationModel


In [16]:
#create iterable from files
from eden.converter.fasta import fasta_to_sequence
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
from itertools import tee
seqs,seqs_=tee(seqs)
iterable_pos = seqs
from eden.modifier.seq import seq_to_seq, shuffle_modifier
iterable_neg = seq_to_seq( seqs_, modifier=shuffle_modifier, times=times, order=2 )

#consider only the first 'size' elements
from itertools import islice
iterable_pos = islice(iterable_pos,size)
iterable_neg = islice(iterable_neg,size*times)

#split train/test
from eden.util import random_bipartition_iter
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)


Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF00005/alignment?acc=RF00005&format=fastau&download=0 HTTP/1.1" 200 90476

In [17]:
%%time
#make predictive model
from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel(pre_processor=pre_processor, 
                                                estimator=estimator, 
                                                vectorizer=vectorizer,
                                                n_jobs=n_jobs,
                                                pre_processor_n_jobs=n_jobs,
                                                n_blocks=8)

#optimize hyperparameters and fit model
from numpy.random import randint
from numpy.random import uniform
pre_processor_parameters={'max_num':[1,3], 
                          'shape_type':[5], 
                          'energy_range':[10,30]}

vectorizer_parameters={'complexity':[2,3]}

estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      'power_t':uniform(0.1, size=n_iter),
                      'alpha': [10**x for x in range(-8,0)],
                      'eta0': [10**x for x in range(-4,-1)],
                      'learning_rate': ["invscaling", "constant", "optimal"],
                      'n_jobs':[n_jobs]}
active_set_size = size * 2
model_fname='eden_model_active_%s'%rfam_id
model.optimize(iterable_pos_train, iterable_neg_train, 
               model_name=model_fname,
               score_func=lambda avg_score,std_score : avg_score - std_score * 2,
               scoring='roc_auc',
               n_active_learning_iterations=4,
               n_iter=n_iter,
               size_positive=-1,              #-1: do not subsample the positive instances
               size_negative=active_set_size, #cap on negatives used per active learning round
               cv=5,
               pre_processor_parameters=pre_processor_parameters, 
               vectorizer_parameters=vectorizer_parameters, 
               estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
energy_range: [10, 30]
   max_num: [1, 3]
shape_type: [5]

Vectorizer:
complexity: [2, 3]

Estimator:
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.56669063  0.80652926  0.35336519  0.66073686  0.89983879  0.55953837
  0.74666276  0.5771423 ]
learning_rate: ['invscaling', 'constant', 'optimal']
      loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
    n_iter: [47 19 51 13 91 28 60 18]
    n_jobs: [8]
   penalty: ['l1', 'l2', 'elasticnet']
   power_t: [ 0.97540286  0.454284    0.53392293  0.44409479  0.52466934  0.57968666
  0.90622519  0.4568082 ]
iteration: (1/5) 1/8 score (roc_auc): 0.326 (0.733 +- 0.204)


	Iteration: 1/8 (after 21.7 sec; 0:00:21.652055)
Best score (roc_auc): 0.326 (0.733 +- 0.204)

Data:
Instances: 42 ; Features: 1048577 with an avg of 546 features per instance
class: 1 count:25 (0.60)	class: -1 count:17 (0.40)	

	Model parameters:

Pre_processor:
energy_range: 10
   max_num: 1
shape_type: 5

Vectorizer:
complexity: 2

Estimator:
     alpha: 1e-07
      eta0: 0.01
  l1_ratio: 0.746662757941
learning_rate: invscaling
      loss: modified_huber
    n_iter: 13
    n_jobs: 8
   penalty: l2
   power_t: 0.524669336646
iteration: (2/5) 1/8 score (roc_auc): 0.443 (0.770 +- 0.163)


	Iteration: 1/8 (after 22.3 sec; 0:00:22.299703)
Best score (roc_auc): 0.443 (0.770 +- 0.163)

Data:
Instances: 42 ; Features: 1048577 with an avg of 546 features per instance
class: 1 count:25 (0.60)	class: -1 count:17 (0.40)	

	Model parameters:

Pre_processor:
energy_range: 10
   max_num: 1
shape_type: 5

Vectorizer:
complexity: 2

Estimator:
     alpha: 0.0001
      eta0: 0.0001
  l1_ratio: 0.566690633962
learning_rate: optimal
      loss: perceptron
    n_iter: 18
    n_jobs: 8
   penalty: l2
   power_t: 0.444094793409
iteration: (3/5) 1/8 score (roc_auc): 0.476 (0.733 +- 0.129)


	Iteration: 1/8 (after 23.0 sec; 0:00:22.956146)
Best score (roc_auc): 0.476 (0.733 +- 0.129)

Data:
Instances: 42 ; Features: 1048577 with an avg of 546 features per instance
class: 1 count:25 (0.60)	class: -1 count:17 (0.40)	

	Model parameters:

Pre_processor:
energy_range: 10
   max_num: 1
shape_type: 5

Vectorizer:
complexity: 2

Estimator:
     alpha: 0.001
      eta0: 0.001
  l1_ratio: 0.566690633962
learning_rate: optimal
      loss: modified_huber
    n_iter: 18
    n_jobs: 8
   penalty: elasticnet
   power_t: 0.906225191571
iteration: (4/5) 1/8 score (roc_auc): 0.424 (0.777 +- 0.176)
iteration: (5/5) 1/8 score (roc_auc): 0.441 (0.740 +- 0.150)

Failed iteration: 2/8 (at 34.1 sec; 0:00:34.121283)
Common base class for all non-exit exceptions.
No instances found that satisfy constraints
Failed with the following setting:

	Model parameters:

Pre_processor:
energy_range: 30
   max_num: 3
shape_type: 5

Vectorizer:
complexity: 2

Estimator:
     alpha: 1e-07
      eta0: 0.001
  l1_ratio: 0.746662757941
learning_rate: optimal
      loss: squared_hinge
    n_iter: 47
    n_jobs: 8
   penalty: l2
   power_t: 0.454284003053
...continuing
iteration: (1/5) 2/8 score (roc_auc): -0.129 (0.223 +- 0.176)
iteration: (2/5) 2/8 score (roc_auc): 0.424 (0.777 +- 0.176)
iteration: (3/5) 2/8 score (roc_auc): 0.363 (0.763 +- 0.200)
iteration: (4/5) 2/8 score (roc_auc): 0.268 (0.663 +- 0.198)
iteration: (5/5) 2/8 score (roc_auc): 0.363 (0.763 +- 0.200)
iteration: (1/5) 3/8 score (roc_auc): 0.763 (0.930 +- 0.084)


	Iteration: 3/8 (after 70.4 sec; 0:01:10.436770)
Best score (roc_auc): 0.763 (0.930 +- 0.084)

Data:
Instances: 125 ; Features: 1048577 with an avg of 945 features per instance
class: 1 count:25 (0.20)	class: -1 count:100 (0.80)	

	Model parameters:

Pre_processor:
energy_range: 30
   max_num: 3
shape_type: 5

Vectorizer:
complexity: 2

Estimator:
     alpha: 1e-06
      eta0: 0.0001
  l1_ratio: 0.559538369024
learning_rate: optimal
      loss: log
    n_iter: 19
    n_jobs: 8
   penalty: elasticnet
   power_t: 0.524669336646
iteration: (2/5) 3/8 score (roc_auc): 0.577 (0.860 +- 0.142)
iteration: (3/5) 3/8 score (roc_auc): 0.583 (0.858 +- 0.137)
iteration: (4/5) 3/8 score (roc_auc): 0.649 (0.890 +- 0.120)
iteration: (5/5) 3/8 score (roc_auc): -0.131 (0.110 +- 0.120)
iteration: (1/5) 4/8 score (roc_auc): -0.133 (0.114 +- 0.124)
iteration: (2/5) 4/8 score (roc_auc): 0.639 (0.886 +- 0.124)
iteration: (3/5) 4/8 score (roc_auc): 0.688 (0.898 +- 0.105)
iteration: (4/5) 4/8 score (roc_auc): 0.734 (0.924 +- 0.095)
iteration: (5/5) 4/8 score (roc_auc): 0.708 (0.924 +- 0.108)


	Parameters range:

Pre_processor:
energy_range: [10, 10, 10, 30]
   max_num: [1, 1, 1, 3]
shape_type: [5, 5, 5, 5]

Vectorizer:
complexity: [2, 2, 2, 2]

Estimator:
     alpha: [1e-07, 0.0001, 0.001, 1e-06]
      eta0: [0.01, 0.0001, 0.001, 0.0001]
  l1_ratio: [0.74666275794065129, 0.56669063396221242, 0.56669063396221242, 0.55953836902446996]
learning_rate: ['invscaling', 'optimal', 'optimal', 'optimal']
      loss: ['modified_huber', 'perceptron', 'modified_huber', 'log']
    n_iter: [13, 18, 18, 19]
    n_jobs: [8, 8, 8, 8]
   penalty: ['l2', 'l2', 'elasticnet', 'elasticnet']
   power_t: [0.5246693366459444, 0.44409479340934155, 0.90622519157082448, 0.5246693366459444]

Failed iteration: 5/8 (at 130.5 sec; 0:02:10.534547)
Common base class for all non-exit exceptions.
No instances found that satisfy constraints
Failed with the following setting:

	Model parameters:

Pre_processor:
energy_range: 10
   max_num: 1
shape_type: 5

Vectorizer:
complexity: 2

Estimator:
     alpha: 1e-07
      eta0: 0.01
  l1_ratio: 0.660736855167
learning_rate: optimal
      loss: modified_huber
    n_iter: 51
    n_jobs: 8
   penalty: elasticnet
   power_t: 0.579686659627
...continuing
iteration: (1/5) 5/8 score (roc_auc): 0.585 (0.854 +- 0.135)
iteration: (2/5) 5/8 score (roc_auc): 0.617 (0.894 +- 0.139)
iteration: (3/5) 5/8 score (roc_auc): 0.235 (0.344 +- 0.054)
iteration: (4/5) 5/8 score (roc_auc): 0.607 (0.894 +- 0.143)
iteration: (5/5) 5/8 score (roc_auc): 0.668 (0.872 +- 0.102)
iteration: (1/5) 6/8 score (roc_auc): 0.423 (0.636 +- 0.107)
iteration: (2/5) 6/8 score (roc_auc): 0.474 (0.706 +- 0.116)
iteration: (3/5) 6/8 score (roc_auc): 0.504 (0.750 +- 0.123)
iteration: (4/5) 6/8 score (roc_auc): 0.454 (0.709 +- 0.128)
iteration: (5/5) 6/8 score (roc_auc): 0.466 (0.764 +- 0.149)
iteration: (1/5) 7/8 score (roc_auc): 0.486 (0.782 +- 0.148)
iteration: (2/5) 7/8 score (roc_auc): 0.631 (0.866 +- 0.118)
iteration: (3/5) 7/8 score (roc_auc): 0.684 (0.906 +- 0.111)
iteration: (4/5) 7/8 score (roc_auc): 0.698 (0.888 +- 0.095)
iteration: (5/5) 7/8 score (roc_auc): 0.694 (0.908 +- 0.107)
iteration: (1/5) 8/8 score (roc_auc): 0.706 (0.848 +- 0.071)
iteration: (2/5) 8/8 score (roc_auc): 0.764 (0.894 +- 0.065)


	Iteration: 8/8 (after 213.4 sec; 0:03:33.426954)
Best score (roc_auc): 0.764 (0.894 +- 0.065)

Data:
Instances: 125 ; Features: 1048577 with an avg of 541 features per instance
class: 1 count:25 (0.20)	class: -1 count:100 (0.80)	

	Model parameters:

Pre_processor:
energy_range: 30
   max_num: 1
shape_type: 5

Vectorizer:
complexity: 2

Estimator:
     alpha: 0.001
      eta0: 0.0001
  l1_ratio: 0.559538369024
learning_rate: optimal
      loss: modified_huber
    n_iter: 18
    n_jobs: 8
   penalty: l2
   power_t: 0.524669336646
iteration: (3/5) 8/8 score (roc_auc): 0.792 (0.894 +- 0.051)


	Iteration: 8/8 (after 214.2 sec; 0:03:34.238634)
Best score (roc_auc): 0.792 (0.894 +- 0.051)

Data:
Instances: 125 ; Features: 1048577 with an avg of 541 features per instance
class: 1 count:25 (0.20)	class: -1 count:100 (0.80)	

	Model parameters:

Pre_processor:
energy_range: 30
   max_num: 1
shape_type: 5

Vectorizer:
complexity: 2

Estimator:
     alpha: 0.0001
      eta0: 0.0001
  l1_ratio: 0.746662757941
learning_rate: optimal
      loss: modified_huber
    n_iter: 18
    n_jobs: 8
   penalty: elasticnet
   power_t: 0.524669336646
iteration: (4/5) 8/8 score (roc_auc): 0.707 (0.870 +- 0.082)
iteration: (5/5) 8/8 score (roc_auc): 0.716 (0.882 +- 0.083)
Saved current best model in eden_model_active_RF00005
CPU times: user 1min 58s, sys: 21.1 s, total: 2min 19s
Wall time: 3min 35s

In [18]:
%%time
#estimate predictive performance
apr, roc = model.estimate( iterable_pos_test, iterable_neg_test )


Classifier:
SGDClassifier(alpha=0.0001, average=True, class_weight='auto', epsilon=0.1,
       eta0=0.0001, fit_intercept=True, l1_ratio=0.74666275794065129,
       learning_rate='optimal', loss='modified_huber', n_iter=18, n_jobs=8,
       penalty='elasticnet', power_t=0.5246693366459444, random_state=None,
       shuffle=True, verbose=0, warm_start=False)

Data:
Instances: 125 ; Features: 1048577 with an avg of 538 features per instance

Predictive performance estimate:
             precision    recall  f1-score   support

         -1       0.91      0.92      0.92       100
          1       0.67      0.64      0.65        25

avg / total       0.86      0.86      0.86       125

APR: 0.726
ROC: 0.892
CPU times: user 1.01 s, sys: 203 ms, total: 1.21 s
Wall time: 2.3 s

In [19]:
from eden.model import ActiveLearningBinaryClassificationModel

model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)

from eden.converter.fasta import fasta_to_sequence
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
from itertools import tee
seqs,seqs_=tee(seqs)
iterable_pos = seqs

#consider only the first 'size' elements
from itertools import islice
iterable_pos = islice(iterable_pos,size)

predictions= model2.decision_function( iterable_pos )
for n,i in enumerate(sorted(predictions)): print n,i


Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF00005/alignment?acc=RF00005&format=fastau&download=0 HTTP/1.1" 200 90476
0 -4.93692257937
1 -2.74610987177
2 -2.22186099905
3 -0.995063260768
4 -0.949031397056
5 -0.930207823853
6 -0.751964258797
7 -0.575625925686
8 -0.336611539503
9 -0.314518399351
10 0.366116664261
11 0.64558891188
12 0.813601783682
13 1.16723461776
14 1.55982448902
15 1.81945648844
16 2.24766922499
17 2.94408070282
18 3.35835161549
19 3.6058948847
20 4.1397528167
21 4.48350512652
22 4.58706965119
23 4.85000190096
24 5.35558453321
25 5.44303250289
26 5.66816181929
27 5.70053068416
28 5.73434252065
29 5.79376417289
30 5.92156399284
31 6.44350074678
32 7.2070276687
33 7.23461007192
34 8.17992764001
35 8.27971296395
36 8.85210752957
37 10.5540085204
38 10.7107895449
39 12.2171121516
40 12.2441419411
41 12.5749386764
42 13.4916899368
43 14.449795156
44 14.609005997
45 14.7850706707
46 14.8684982876
47 14.8814458026
48 17.3976129179
49 20.3497176348