In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Auto-reload imported modules before each cell execution
# (mode 2 = reload all modules), so edits to EDeN sources take effect live.
%load_ext autoreload
%autoreload 2

In [2]:
from eden.converter.molecule import obabel
import networkx as nx
import pybel
import requests
import os.path
from itertools import tee
from numpy.random import randint
from numpy.random import uniform
from eden.graph import Vectorizer
from sklearn.linear_model import SGDClassifier
import datetime, time
from eden.util import random_bipartition_iter
from eden.model import ActiveLearningBinaryClassificationModel

from eden.util import configure_logging
import logging
# Attach EDeN's logging configuration to the root logger; verbosity=2
# presumably enables detailed progress output — confirm against eden.util.
configure_logging(logging.getLogger(),verbosity=2)

In [3]:
def make_iterable(filename, file_format):
    """Yield one molecule record at a time from a molecule file.

    Parameters
    ----------
    filename : str
        Path to the input file.
    file_format : str
        'sdf' for SD files (records terminated by a '$$$$' line) or
        'smi' for SMILES files (one molecule per line).

    Yields
    ------
    str
        One complete record. For SDF input the terminating '$$$$'
        line is included when present.

    Raises
    ------
    ValueError
        If `file_format` is not one of the supported formats.
    """
    if file_format == 'sdf':
        with open(filename) as f:
            record_lines = []
            for line in f:
                record_lines.append(line)
                if line.strip() == '$$$$':
                    yield ''.join(record_lines)
                    record_lines = []
            # Emit a trailing record that is missing its final '$$$$'
            # terminator instead of silently dropping it.
            if record_lines:
                yield ''.join(record_lines)
    elif file_format == 'smi':
        with open(filename) as f:
            for line in f:
                yield line
    else:
        # Fail loudly on an unsupported/typo'd format instead of
        # silently yielding nothing.
        raise ValueError("Unsupported file_format: %r" % file_format)

The data sets (the active/inactive SDF files for the chosen PubChem assay) are defined below:


In [4]:
# PubChem BioAssay id used throughout this notebook (an alternative
# assay that was experimented with is 2401).
AID = 720577
#AID=2401
# Local directory holding the downloaded assay SDF files.
DATA_DIR = '/home/liconj/proj/thesis/EDeN/examples/model_comparison/data'
# Full paths to the active / inactive compound files for the chosen assay.
active_fname = '%s/AID%s_active.sdf' % (DATA_DIR, AID)
inactive_fname = '%s/AID%s_inactive.sdf' % (DATA_DIR, AID)


Original model

Functions for training and testing the model


In [5]:
# Output path where the best model found by the search is saved.
model_fname = DATA_DIR + '/AID%s.model'%AID
model_type = "default"  # graph encoding passed to pre_processor: 'default' or '3d'
n_conf = 10             # number of conformers, forwarded to the obabel converter
n_iter = 50             # number of hyperparameter-optimization iterations
active_set_size = 5
n_active_learning_iterations = 0
threshold = 1
train_test_split = 0.8  # fraction of data used for training


# Search space for the pre-processor: 'k' and 'threshold' are pre-sampled
# arrays (one candidate per optimization iteration); semantics of both are
# defined by the obabel converter — TODO confirm against eden.converter.
pre_processor_parameters={'k':randint(1, 10,size=n_iter),
                          'threshold':randint(3, 10, size=n_iter),
                          'model_type':[model_type],
                          'n_conf':[n_conf]}

def pre_processor(data, model_type="3d", **kwargs):
    """Convert raw molecule records into EDeN graph iterables.

    Parameters
    ----------
    data : iterable
        Molecule records (e.g. SDF record strings) to convert.
    model_type : str
        'default' for plain molecular graphs, '3d' for 3D-structure graphs.
    **kwargs
        Forwarded to the obabel converter (e.g. k, threshold, n_conf).

    Returns
    -------
    Iterable of graphs produced by the selected obabel converter.

    Raises
    ------
    ValueError
        If `model_type` is neither 'default' nor '3d'.
    """
    if model_type == "default":
        iterable = obabel.obabel_to_eden(data, **kwargs)
    elif model_type == "3d":
        iterable = obabel.obabel_to_eden3d(data, **kwargs)
    else:
        # Previously an unknown value fell through and raised an opaque
        # UnboundLocalError on `return iterable`; fail with a clear message.
        raise ValueError("Unknown model_type: %r" % model_type)
    return iterable



# Graph vectorizer (EDeN) and linear classifier trained with SGD.
vectorizer = Vectorizer()
# class_weight='auto' reweights the imbalanced active/inactive classes
# (NOTE(review): renamed to 'balanced' in later scikit-learn versions —
# confirm the installed version still accepts 'auto').
estimator = SGDClassifier(class_weight='auto', shuffle=True)

# Make predictive model
model = ActiveLearningBinaryClassificationModel(pre_processor,
                                                estimator=estimator,
                                                vectorizer=vectorizer,
                                                n_jobs=2,
                                                pre_processor_n_jobs=2,
                                                n_blocks = 10,
                                                fit_vectorizer=True)

In [6]:
########
# Create iterables from files
########

# Use the configured absolute paths (active_fname / inactive_fname) so the
# notebook follows the AID selected above and does not depend on the current
# working directory; previously the filenames were hardcoded relative paths.
iterable_pos = make_iterable(active_fname, 'sdf')
iterable_neg = make_iterable(inactive_fname, 'sdf')
iterable_pos, iterable_pos_ = tee(iterable_pos)
iterable_neg, iterable_neg_ = tee(iterable_neg)

# Count the instances by consuming the duplicated iterators only.
start = time.time()
print('# positives: %d  # negatives: %d (%.1f sec %s)'%(sum(1 for x in iterable_pos_), sum(1 for x in iterable_neg_), time.time() - start, str(datetime.timedelta(seconds=(time.time() - start)))))


iterable_pos, iterable_pos_ = tee(iterable_pos)
iterable_neg, iterable_neg_ = tee(iterable_neg)

# Split train/test
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)


# positives: 80  # negatives: 146 (0.2 sec 0:00:00.166845)

In [7]:
%%time
# Optimize hyperparameters and fit model
# Since this model is fitted much more slowly, use a single vectorizer
#vectorizer_parameters={'complexity':[2,3,4],
#                       'discretization_size':randint(2, 3,size=n_iter),
#                       'discretization_dimension':randint(2, 3,size=n_iter)}

# Vectorizer search space: graph-kernel complexity and the 'n' parameter.
vectorizer_parameters={'complexity':[4,5,6], 'n':[2,3,4]}


# SGD estimator search space. Array-valued entries hold one pre-sampled
# candidate per optimization iteration; list-valued entries are discrete
# choices — TODO confirm sampling policy against
# ActiveLearningBinaryClassificationModel.optimize.
estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      'l1_ratio':uniform(0.1,0.9, size=n_iter),
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      'power_t':uniform(0.1, size=n_iter),
                      'alpha': [10**x for x in range(-8,-2)],
                      'eta0': [10**x for x in range(-4,-1)],
                      'learning_rate': ["invscaling", "constant", "optimal"]}

# Randomized search over pre-processor/vectorizer/estimator parameters with
# 3-fold CV; per the log below, the current best model is saved to
# model_fname whenever the score improves.
model.optimize(iterable_pos_train, iterable_neg_train,
               model_name=model_fname,
               n_active_learning_iterations=0,
               size_positive=-1,
               size_negative=active_set_size,
               n_iter=n_iter, cv=3,
               pre_processor_parameters=pre_processor_parameters,
               vectorizer_parameters=vectorizer_parameters,
               estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
         k: [1 2 5 7 2 4 6 2 8 1 9 7 4 1 1 7 5 7 4 9 8 7 7 5 7 3 8 2 4 5 9 1 8 8 7 3 3
 1 3 3 8 7 7 5 6 8 5 3 6 5]
model_type: ['default']
    n_conf: [10]
 threshold: [4 9 3 9 9 5 4 8 4 6 8 6 8 8 7 9 4 8 9 7 5 3 7 9 3 7 8 3 3 4 7 6 8 6 4 7 9
 4 7 5 7 8 4 4 5 3 7 5 7 9]

Vectorizer:
complexity: [4, 5, 6]
         n: [2, 3, 4]

Estimator:
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.16031039  0.19845281  0.81871     0.66751793  0.4009681   0.23889493
  0.71036693  0.50040598  0.15835753  0.6508024   0.36833993  0.13104658
  0.36660095  0.46036753  0.33050416  0.62682886  0.14074584  0.56182112
  0.34261588  0.36530662  0.85425686  0.7306414   0.60846963  0.42691566
  0.78952623  0.3694253   0.5744091   0.40837884  0.14373039  0.83631338
  0.52421769  0.24527844  0.88137749  0.65667606  0.14469994  0.75664721
  0.31348643  0.57295542  0.84086094  0.17308704  0.17949613  0.22120594
  0.58401019  0.56795334  0.41788938  0.86224158  0.30166915  0.37119235
  0.76752245  0.33569609]
learning_rate: ['invscaling', 'constant', 'optimal']
      loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
    n_iter: [99 71 29 16 79 80 84 63 67 78  9 22 70 75 22 70 22 60 63 92 74 24 18 98 84
 78 82 38 63 84 88 69 81 33 67 86 56 25 22 90 92 43 47 45 96 32 65 17 41 23]
   penalty: ['l1', 'l2', 'elasticnet']
   power_t: [ 0.15012097  0.67964475  0.39023425  0.82852946  0.2286062   0.39835285
  0.10721758  0.66577538  0.93493259  0.70153815  0.61538431  0.4775931
  0.19007952  0.30890724  0.16760926  0.75168972  0.18116357  0.84638624
  0.83215051  0.59720332  0.38576222  0.81601551  0.53809087  0.60850913
  0.22776765  0.56557303  0.94788029  0.4495919   0.29210874  0.33320798
  0.72680866  0.70388681  0.1521277   0.90242578  0.15514298  0.62850323
  0.4630725   0.43927169  0.69067657  0.49681397  0.98593875  0.12818548
  0.39682497  0.14863364  0.46884029  0.18818256  0.39492344  0.18406538
  0.11373548  0.13823853]
iteration: (1/5) 1/50 score (roc_auc): 0.561 (0.620 +- 0.059)


	Iteration: 1/50 (after 6.3 sec; 0:00:06.342705)
Best score (roc_auc): 0.561 (0.620 +- 0.059)

Data:
Instances: 180 ; Features: 1048577 with an avg of 584 features per instance
class: 1 count:64 (0.36)	class: -1 count:116 (0.64)	

	Model parameters:

Pre_processor:
         k: 1
model_type: default
    n_conf: 10
 threshold: 4

Vectorizer:
complexity: 4
         n: 2

Estimator:
     alpha: 1e-07
      eta0: 0.01
  l1_ratio: 0.160310390705
learning_rate: invscaling
      loss: hinge
    n_iter: 78
   penalty: elasticnet
   power_t: 0.726808658574
iteration: (2/5) 1/50 score (roc_auc): 0.585 (0.703 +- 0.118)


	Iteration: 1/50 (after 8.8 sec; 0:00:08.817881)
Best score (roc_auc): 0.585 (0.703 +- 0.118)

Data:
Instances: 180 ; Features: 1048577 with an avg of 584 features per instance
class: 1 count:64 (0.36)	class: -1 count:116 (0.64)	

	Model parameters:

Pre_processor:
         k: 1
model_type: default
    n_conf: 10
 threshold: 4

Vectorizer:
complexity: 4
         n: 2

Estimator:
     alpha: 0.0001
      eta0: 0.001
  l1_ratio: 0.767522449807
learning_rate: optimal
      loss: log
    n_iter: 67
   penalty: elasticnet
   power_t: 0.902425782015
iteration: (3/5) 1/50 score (roc_auc): 0.491 (0.584 +- 0.092)
iteration: (4/5) 1/50 score (roc_auc): 0.599 (0.629 +- 0.030)


	Iteration: 1/50 (after 11.1 sec; 0:00:11.067827)
Best score (roc_auc): 0.599 (0.629 +- 0.030)

Data:
Instances: 180 ; Features: 1048577 with an avg of 584 features per instance
class: 1 count:64 (0.36)	class: -1 count:116 (0.64)	

	Model parameters:

Pre_processor:
         k: 1
model_type: default
    n_conf: 10
 threshold: 4

Vectorizer:
complexity: 4
         n: 2

Estimator:
     alpha: 0.001
      eta0: 0.01
  l1_ratio: 0.301669147655
learning_rate: optimal
      loss: log
    n_iter: 41
   penalty: l1
   power_t: 0.751689723567
iteration: (5/5) 1/50 score (roc_auc): 0.535 (0.656 +- 0.121)
iteration: (1/5) 2/50 score (roc_auc): 0.484 (0.558 +- 0.074)
iteration: (2/5) 2/50 score (roc_auc): 0.517 (0.641 +- 0.124)
iteration: (3/5) 2/50 score (roc_auc): 0.577 (0.678 +- 0.101)
iteration: (4/5) 2/50 score (roc_auc): 0.552 (0.674 +- 0.122)
iteration: (5/5) 2/50 score (roc_auc): 0.545 (0.659 +- 0.114)
iteration: (1/5) 3/50 score (roc_auc): 0.617 (0.621 +- 0.004)


	Iteration: 3/50 (after 27.0 sec; 0:00:27.008297)
Best score (roc_auc): 0.617 (0.621 +- 0.004)

Data:
Instances: 180 ; Features: 1048577 with an avg of 869 features per instance
class: 1 count:64 (0.36)	class: -1 count:116 (0.64)	

	Model parameters:

Pre_processor:
         k: 6
model_type: default
    n_conf: 10
 threshold: 7

Vectorizer:
complexity: 5
         n: 2

Estimator:
     alpha: 0.001
      eta0: 0.01
  l1_ratio: 0.52421768768
learning_rate: optimal
      loss: log
    n_iter: 16
   penalty: l1
   power_t: 0.496813969814
iteration: (2/5) 3/50 score (roc_auc): 0.537 (0.661 +- 0.124)
iteration: (3/5) 3/50 score (roc_auc): 0.539 (0.619 +- 0.080)
iteration: (4/5) 3/50 score (roc_auc): 0.540 (0.618 +- 0.078)
iteration: (5/5) 3/50 score (roc_auc): 0.443 (0.546 +- 0.103)
iteration: (1/5) 4/50 score (roc_auc): 0.497 (0.634 +- 0.137)
iteration: (2/5) 4/50 score (roc_auc): 0.543 (0.622 +- 0.079)
iteration: (3/5) 4/50 score (roc_auc): 0.485 (0.626 +- 0.141)
iteration: (4/5) 4/50 score (roc_auc): 0.540 (0.675 +- 0.135)
iteration: (5/5) 4/50 score (roc_auc): 0.552 (0.624 +- 0.072)
iteration: (1/5) 5/50 score (roc_auc): 0.485 (0.638 +- 0.153)
iteration: (2/5) 5/50 score (roc_auc): 0.633 (0.669 +- 0.036)


	Iteration: 5/50 (after 48.0 sec; 0:00:47.971685)
Best score (roc_auc): 0.633 (0.669 +- 0.036)

Data:
Instances: 180 ; Features: 1048577 with an avg of 869 features per instance
class: 1 count:64 (0.36)	class: -1 count:116 (0.64)	

	Model parameters:

Pre_processor:
         k: 7
model_type: default
    n_conf: 10
 threshold: 9

Vectorizer:
complexity: 5
         n: 2

Estimator:
     alpha: 0.001
      eta0: 0.0001
  l1_ratio: 0.36942530135
learning_rate: optimal
      loss: hinge
    n_iter: 22
   penalty: l1
   power_t: 0.128185483351
iteration: (3/5) 5/50 score (roc_auc): 0.480 (0.621 +- 0.141)
iteration: (4/5) 5/50 score (roc_auc): 0.585 (0.687 +- 0.102)
iteration: (5/5) 5/50 score (roc_auc): 0.529 (0.657 +- 0.128)
iteration: (1/5) 6/50 score (roc_auc): 0.545 (0.677 +- 0.133)
iteration: (2/5) 6/50 score (roc_auc): 0.540 (0.621 +- 0.080)
iteration: (3/5) 6/50 score (roc_auc): 0.534 (0.628 +- 0.094)
iteration: (4/5) 6/50 score (roc_auc): 0.542 (0.619 +- 0.077)
iteration: (5/5) 6/50 score (roc_auc): 0.541 (0.620 +- 0.079)
iteration: (1/5) 7/50 score (roc_auc): 0.560 (0.652 +- 0.092)
iteration: (2/5) 7/50 score (roc_auc): 0.502 (0.650 +- 0.148)
iteration: (3/5) 7/50 score (roc_auc): 0.531 (0.607 +- 0.076)
iteration: (4/5) 7/50 score (roc_auc): 0.630 (0.664 +- 0.034)
iteration: (5/5) 7/50 score (roc_auc): 0.555 (0.593 +- 0.038)
iteration: (1/5) 8/50 score (roc_auc): 0.544 (0.621 +- 0.076)
iteration: (2/5) 8/50 score (roc_auc): 0.532 (0.671 +- 0.139)
iteration: (3/5) 8/50 score (roc_auc): 0.559 (0.636 +- 0.077)
iteration: (4/5) 8/50 score (roc_auc): 0.552 (0.605 +- 0.052)
iteration: (5/5) 8/50 score (roc_auc): 0.557 (0.624 +- 0.067)
iteration: (1/5) 9/50 score (roc_auc): 0.541 (0.646 +- 0.105)
iteration: (2/5) 9/50 score (roc_auc): 0.533 (0.623 +- 0.090)
iteration: (3/5) 9/50 score (roc_auc): 0.529 (0.622 +- 0.093)
iteration: (4/5) 9/50 score (roc_auc): 0.533 (0.650 +- 0.117)
iteration: (5/5) 9/50 score (roc_auc): 0.543 (0.650 +- 0.107)
iteration: (1/5) 10/50 score (roc_auc): 0.532 (0.607 +- 0.075)
iteration: (2/5) 10/50 score (roc_auc): 0.475 (0.630 +- 0.155)
iteration: (3/5) 10/50 score (roc_auc): 0.534 (0.608 +- 0.074)
iteration: (4/5) 10/50 score (roc_auc): 0.569 (0.642 +- 0.073)
iteration: (5/5) 10/50 score (roc_auc): 0.562 (0.627 +- 0.065)
iteration: (1/5) 11/50 score (roc_auc): 0.500 (0.624 +- 0.124)
iteration: (2/5) 11/50 score (roc_auc): 0.553 (0.631 +- 0.078)
iteration: (3/5) 11/50 score (roc_auc): 0.516 (0.608 +- 0.093)
iteration: (4/5) 11/50 score (roc_auc): 0.549 (0.625 +- 0.076)
iteration: (5/5) 11/50 score (roc_auc): 0.537 (0.594 +- 0.057)
iteration: (1/5) 12/50 score (roc_auc): 0.538 (0.618 +- 0.080)
iteration: (2/5) 12/50 score (roc_auc): 0.534 (0.617 +- 0.083)
iteration: (3/5) 12/50 score (roc_auc): 0.559 (0.596 +- 0.037)
iteration: (4/5) 12/50 score (roc_auc): 0.534 (0.638 +- 0.104)
iteration: (5/5) 12/50 score (roc_auc): 0.539 (0.618 +- 0.079)
iteration: (1/5) 13/50 score (roc_auc): 0.521 (0.645 +- 0.124)
iteration: (2/5) 13/50 score (roc_auc): 0.539 (0.618 +- 0.079)
iteration: (3/5) 13/50 score (roc_auc): 0.542 (0.622 +- 0.080)
iteration: (4/5) 13/50 score (roc_auc): 0.598 (0.693 +- 0.096)
iteration: (5/5) 13/50 score (roc_auc): 0.535 (0.662 +- 0.128)
iteration: (1/5) 14/50 score (roc_auc): 0.496 (0.647 +- 0.151)
iteration: (2/5) 14/50 score (roc_auc): 0.535 (0.629 +- 0.094)
iteration: (3/5) 14/50 score (roc_auc): 0.538 (0.675 +- 0.137)
iteration: (4/5) 14/50 score (roc_auc): 0.537 (0.628 +- 0.091)
iteration: (5/5) 14/50 score (roc_auc): 0.542 (0.620 +- 0.078)
iteration: (1/5) 15/50 score (roc_auc): 0.512 (0.584 +- 0.072)
iteration: (2/5) 15/50 score (roc_auc): 0.559 (0.677 +- 0.118)
iteration: (3/5) 15/50 score (roc_auc): 0.579 (0.674 +- 0.095)
iteration: (4/5) 15/50 score (roc_auc): 0.568 (0.688 +- 0.120)
iteration: (5/5) 15/50 score (roc_auc): 0.534 (0.608 +- 0.074)
iteration: (1/5) 16/50 score (roc_auc): 0.523 (0.617 +- 0.093)
iteration: (2/5) 16/50 score (roc_auc): 0.552 (0.664 +- 0.113)
iteration: (3/5) 16/50 score (roc_auc): 0.516 (0.625 +- 0.109)
iteration: (4/5) 16/50 score (roc_auc): 0.507 (0.657 +- 0.149)
iteration: (5/5) 16/50 score (roc_auc): 0.543 (0.648 +- 0.105)
iteration: (1/5) 17/50 score (roc_auc): 0.538 (0.618 +- 0.080)
iteration: (2/5) 17/50 score (roc_auc): 0.526 (0.644 +- 0.118)
iteration: (3/5) 17/50 score (roc_auc): 0.574 (0.695 +- 0.121)
iteration: (4/5) 17/50 score (roc_auc): 0.531 (0.623 +- 0.092)
iteration: (5/5) 17/50 score (roc_auc): 0.544 (0.672 +- 0.128)
iteration: (1/5) 18/50 score (roc_auc): 0.537 (0.676 +- 0.139)
iteration: (2/5) 18/50 score (roc_auc): 0.498 (0.641 +- 0.143)
iteration: (3/5) 18/50 score (roc_auc): 0.611 (0.620 +- 0.008)
iteration: (4/5) 18/50 score (roc_auc): 0.539 (0.618 +- 0.079)
iteration: (5/5) 18/50 score (roc_auc): 0.578 (0.643 +- 0.065)
iteration: (1/5) 19/50 score (roc_auc): 0.527 (0.666 +- 0.139)
iteration: (2/5) 19/50 score (roc_auc): 0.478 (0.639 +- 0.161)
iteration: (3/5) 19/50 score (roc_auc): 0.507 (0.650 +- 0.143)
iteration: (4/5) 19/50 score (roc_auc): 0.549 (0.616 +- 0.067)
iteration: (5/5) 19/50 score (roc_auc): 0.557 (0.634 +- 0.078)
iteration: (1/5) 20/50 score (roc_auc): 0.513 (0.644 +- 0.131)
iteration: (2/5) 20/50 score (roc_auc): 0.539 (0.618 +- 0.079)
iteration: (3/5) 20/50 score (roc_auc): 0.527 (0.655 +- 0.127)
iteration: (4/5) 20/50 score (roc_auc): 0.507 (0.631 +- 0.124)
iteration: (5/5) 20/50 score (roc_auc): 0.544 (0.625 +- 0.081)
iteration: (1/5) 21/50 score (roc_auc): 0.534 (0.608 +- 0.074)
iteration: (2/5) 21/50 score (roc_auc): 0.559 (0.664 +- 0.105)
iteration: (3/5) 21/50 score (roc_auc): 0.528 (0.610 +- 0.082)
iteration: (4/5) 21/50 score (roc_auc): 0.529 (0.612 +- 0.083)
iteration: (5/5) 21/50 score (roc_auc): 0.534 (0.608 +- 0.074)
iteration: (1/5) 22/50 score (roc_auc): 0.470 (0.616 +- 0.146)
iteration: (2/5) 22/50 score (roc_auc): 0.542 (0.622 +- 0.080)
iteration: (3/5) 22/50 score (roc_auc): 0.535 (0.662 +- 0.127)
iteration: (4/5) 22/50 score (roc_auc): 0.524 (0.645 +- 0.121)
iteration: (5/5) 22/50 score (roc_auc): 0.563 (0.637 +- 0.074)
iteration: (1/5) 23/50 score (roc_auc): 0.544 (0.685 +- 0.141)
iteration: (2/5) 23/50 score (roc_auc): 0.573 (0.696 +- 0.122)
iteration: (3/5) 23/50 score (roc_auc): 0.582 (0.636 +- 0.055)
iteration: (4/5) 23/50 score (roc_auc): 0.533 (0.627 +- 0.094)
iteration: (5/5) 23/50 score (roc_auc): 0.601 (0.712 +- 0.111)
iteration: (1/5) 24/50 score (roc_auc): 0.528 (0.612 +- 0.084)
iteration: (2/5) 24/50 score (roc_auc): 0.517 (0.653 +- 0.136)
iteration: (3/5) 24/50 score (roc_auc): 0.582 (0.674 +- 0.091)
iteration: (4/5) 24/50 score (roc_auc): 0.531 (0.618 +- 0.087)
iteration: (5/5) 24/50 score (roc_auc): 0.505 (0.632 +- 0.127)
iteration: (1/5) 25/50 score (roc_auc): 0.482 (0.646 +- 0.164)
iteration: (2/5) 25/50 score (roc_auc): 0.549 (0.614 +- 0.065)
iteration: (3/5) 25/50 score (roc_auc): 0.523 (0.675 +- 0.152)
iteration: (4/5) 25/50 score (roc_auc): 0.505 (0.648 +- 0.143)
iteration: (5/5) 25/50 score (roc_auc): 0.548 (0.623 +- 0.076)


	Parameters range:

Pre_processor:
         k: [1, 1, 1, 6, 7]
model_type: ['default', 'default', 'default', 'default', 'default']
    n_conf: [10, 10, 10, 10, 10]
 threshold: [4, 4, 4, 7, 9]

Vectorizer:
complexity: [4, 4, 4, 5, 5]
         n: [2, 2, 2, 2, 2]

Estimator:
     alpha: [1e-07, 0.0001, 0.001, 0.001, 0.001]
      eta0: [0.01, 0.001, 0.01, 0.01, 0.0001]
  l1_ratio: [0.16031039070520708, 0.76752244980748252, 0.30166914765548186, 0.52421768767965704, 0.36942530134972451]
learning_rate: ['invscaling', 'optimal', 'optimal', 'optimal', 'optimal']
      loss: ['hinge', 'log', 'log', 'log', 'hinge']
    n_iter: [78, 67, 41, 16, 22]
   penalty: ['elasticnet', 'elasticnet', 'l1', 'l1', 'l1']
   power_t: [0.72680865857370514, 0.90242578201549728, 0.75168972356655483, 0.49681396981441972, 0.1281854833514135]
iteration: (1/5) 26/50 score (roc_auc): 0.605 (0.661 +- 0.055)
iteration: (2/5) 26/50 score (roc_auc): 0.610 (0.618 +- 0.008)
iteration: (3/5) 26/50 score (roc_auc): 0.548 (0.621 +- 0.072)
iteration: (4/5) 26/50 score (roc_auc): 0.506 (0.651 +- 0.146)
iteration: (5/5) 26/50 score (roc_auc): 0.585 (0.686 +- 0.101)
iteration: (1/5) 27/50 score (roc_auc): 0.526 (0.664 +- 0.138)
iteration: (2/5) 27/50 score (roc_auc): 0.630 (0.708 +- 0.078)
iteration: (3/5) 27/50 score (roc_auc): 0.578 (0.691 +- 0.113)
iteration: (4/5) 27/50 score (roc_auc): 0.544 (0.641 +- 0.097)
iteration: (5/5) 27/50 score (roc_auc): 0.614 (0.620 +- 0.007)
iteration: (1/5) 28/50 score (roc_auc): 0.553 (0.597 +- 0.044)
iteration: (2/5) 28/50 score (roc_auc): 0.567 (0.623 +- 0.056)
iteration: (3/5) 28/50 score (roc_auc): 0.547 (0.592 +- 0.045)
iteration: (4/5) 28/50 score (roc_auc): 0.596 (0.697 +- 0.101)
iteration: (5/5) 28/50 score (roc_auc): 0.607 (0.669 +- 0.062)
iteration: (1/5) 29/50 score (roc_auc): 0.608 (0.674 +- 0.067)
iteration: (2/5) 29/50 score (roc_auc): 0.633 (0.692 +- 0.059)


	Iteration: 29/50 (after 353.0 sec; 0:05:52.962020)
Best score (roc_auc): 0.633 (0.692 +- 0.059)

Data:
Instances: 180 ; Features: 1048577 with an avg of 584 features per instance
class: 1 count:64 (0.36)	class: -1 count:116 (0.64)	

	Model parameters:

Pre_processor:
         k: 1
model_type: default
    n_conf: 10
 threshold: 9

Vectorizer:
complexity: 4
         n: 2

Estimator:
     alpha: 0.001
      eta0: 0.001
  l1_ratio: 0.52421768768
learning_rate: optimal
      loss: hinge
    n_iter: 22
   penalty: l1
   power_t: 0.128185483351
iteration: (3/5) 29/50 score (roc_auc): 0.586 (0.619 +- 0.033)
iteration: (4/5) 29/50 score (roc_auc): 0.586 (0.619 +- 0.033)
iteration: (5/5) 29/50 score (roc_auc): 0.585 (0.704 +- 0.119)
iteration: (1/5) 30/50 score (roc_auc): 0.635 (0.689 +- 0.054)


	Iteration: 30/50 (after 368.4 sec; 0:06:08.377476)
Best score (roc_auc): 0.635 (0.689 +- 0.054)

Data:
Instances: 180 ; Features: 1048577 with an avg of 869 features per instance
class: 1 count:64 (0.36)	class: -1 count:116 (0.64)	

	Model parameters:

Pre_processor:
         k: 1
model_type: default
    n_conf: 10
 threshold: 7

Vectorizer:
complexity: 5
         n: 2

Estimator:
     alpha: 0.001
      eta0: 0.01
  l1_ratio: 0.767522449807
learning_rate: optimal
      loss: hinge
    n_iter: 78
   penalty: l1
   power_t: 0.496813969814
iteration: (2/5) 30/50 score (roc_auc): 0.542 (0.613 +- 0.071)
iteration: (3/5) 30/50 score (roc_auc): 0.613 (0.621 +- 0.007)
iteration: (4/5) 30/50 score (roc_auc): 0.587 (0.636 +- 0.050)
iteration: (5/5) 30/50 score (roc_auc): 0.585 (0.602 +- 0.017)
iteration: (1/5) 31/50 score (roc_auc): 0.598 (0.631 +- 0.032)
iteration: (2/5) 31/50 score (roc_auc): 0.544 (0.630 +- 0.086)
iteration: (3/5) 31/50 score (roc_auc): 0.539 (0.676 +- 0.137)
iteration: (4/5) 31/50 score (roc_auc): 0.634 (0.702 +- 0.068)
iteration: (5/5) 31/50 score (roc_auc): 0.572 (0.596 +- 0.024)
iteration: (1/5) 32/50 score (roc_auc): 0.617 (0.687 +- 0.070)
iteration: (2/5) 32/50 score (roc_auc): 0.583 (0.618 +- 0.036)
iteration: (3/5) 32/50 score (roc_auc): 0.592 (0.667 +- 0.075)
iteration: (4/5) 32/50 score (roc_auc): 0.636 (0.706 +- 0.071)


	Iteration: 32/50 (after 398.4 sec; 0:06:38.430165)
Best score (roc_auc): 0.636 (0.706 +- 0.071)

Data:
Instances: 180 ; Features: 1048577 with an avg of 584 features per instance
class: 1 count:64 (0.36)	class: -1 count:116 (0.64)	

	Model parameters:

Pre_processor:
         k: 1
model_type: default
    n_conf: 10
 threshold: 7

Vectorizer:
complexity: 4
         n: 2

Estimator:
     alpha: 0.001
      eta0: 0.01
  l1_ratio: 0.767522449807
learning_rate: optimal
      loss: hinge
    n_iter: 67
   penalty: l1
   power_t: 0.751689723567
iteration: (5/5) 32/50 score (roc_auc): 0.532 (0.661 +- 0.129)
iteration: (1/5) 33/50 score (roc_auc): 0.551 (0.692 +- 0.142)
iteration: (2/5) 33/50 score (roc_auc): 0.592 (0.626 +- 0.034)
iteration: (3/5) 33/50 score (roc_auc): 0.499 (0.592 +- 0.093)
iteration: (4/5) 33/50 score (roc_auc): 0.614 (0.656 +- 0.042)
iteration: (5/5) 33/50 score (roc_auc): 0.627 (0.702 +- 0.076)
iteration: (1/5) 34/50 score (roc_auc): 0.605 (0.664 +- 0.058)
iteration: (2/5) 34/50 score (roc_auc): 0.543 (0.652 +- 0.109)
iteration: (3/5) 34/50 score (roc_auc): 0.506 (0.627 +- 0.121)
iteration: (4/5) 34/50 score (roc_auc): 0.534 (0.640 +- 0.107)
iteration: (5/5) 34/50 score (roc_auc): 0.615 (0.653 +- 0.038)
iteration: (1/5) 35/50 score (roc_auc): 0.627 (0.682 +- 0.056)
iteration: (2/5) 35/50 score (roc_auc): 0.534 (0.603 +- 0.070)
iteration: (3/5) 35/50 score (roc_auc): 0.623 (0.653 +- 0.030)
iteration: (4/5) 35/50 score (roc_auc): 0.613 (0.661 +- 0.048)
iteration: (5/5) 35/50 score (roc_auc): 0.611 (0.619 +- 0.008)
iteration: (1/5) 36/50 score (roc_auc): 0.626 (0.704 +- 0.078)
iteration: (2/5) 36/50 score (roc_auc): 0.572 (0.693 +- 0.122)
iteration: (3/5) 36/50 score (roc_auc): 0.532 (0.661 +- 0.129)
iteration: (4/5) 36/50 score (roc_auc): 0.625 (0.688 +- 0.063)
iteration: (5/5) 36/50 score (roc_auc): 0.562 (0.597 +- 0.036)
iteration: (1/5) 37/50 score (roc_auc): 0.518 (0.626 +- 0.107)
iteration: (2/5) 37/50 score (roc_auc): 0.626 (0.674 +- 0.048)
iteration: (3/5) 37/50 score (roc_auc): 0.535 (0.621 +- 0.086)
iteration: (4/5) 37/50 score (roc_auc): 0.611 (0.619 +- 0.008)
iteration: (5/5) 37/50 score (roc_auc): 0.627 (0.662 +- 0.035)
iteration: (1/5) 38/50 score (roc_auc): 0.584 (0.622 +- 0.037)
iteration: (2/5) 38/50 score (roc_auc): 0.627 (0.702 +- 0.076)
iteration: (3/5) 38/50 score (roc_auc): 0.590 (0.701 +- 0.111)
iteration: (4/5) 38/50 score (roc_auc): 0.625 (0.688 +- 0.063)
iteration: (5/5) 38/50 score (roc_auc): 0.543 (0.613 +- 0.071)
iteration: (1/5) 39/50 score (roc_auc): 0.585 (0.619 +- 0.034)
iteration: (2/5) 39/50 score (roc_auc): 0.625 (0.688 +- 0.063)
iteration: (3/5) 39/50 score (roc_auc): 0.586 (0.705 +- 0.118)
iteration: (4/5) 39/50 score (roc_auc): 0.625 (0.688 +- 0.063)
iteration: (5/5) 39/50 score (roc_auc): 0.537 (0.669 +- 0.132)
iteration: (1/5) 40/50 score (roc_auc): 0.593 (0.627 +- 0.034)
iteration: (2/5) 40/50 score (roc_auc): 0.592 (0.626 +- 0.034)
iteration: (3/5) 40/50 score (roc_auc): 0.557 (0.690 +- 0.133)
iteration: (4/5) 40/50 score (roc_auc): 0.582 (0.701 +- 0.119)
iteration: (5/5) 40/50 score (roc_auc): 0.622 (0.700 +- 0.078)
iteration: (1/5) 41/50 score (roc_auc): 0.551 (0.675 +- 0.124)
iteration: (2/5) 41/50 score (roc_auc): 0.627 (0.682 +- 0.056)
iteration: (3/5) 41/50 score (roc_auc): 0.611 (0.619 +- 0.008)
iteration: (4/5) 41/50 score (roc_auc): 0.502 (0.623 +- 0.121)
iteration: (5/5) 41/50 score (roc_auc): 0.600 (0.612 +- 0.012)
iteration: (1/5) 42/50 score (roc_auc): 0.502 (0.637 +- 0.135)
iteration: (2/5) 42/50 score (roc_auc): 0.550 (0.587 +- 0.036)
iteration: (3/5) 42/50 score (roc_auc): 0.584 (0.622 +- 0.037)
iteration: (4/5) 42/50 score (roc_auc): 0.585 (0.619 +- 0.034)
iteration: (5/5) 42/50 score (roc_auc): 0.583 (0.701 +- 0.118)
iteration: (1/5) 43/50 score (roc_auc): 0.584 (0.622 +- 0.037)
iteration: (2/5) 43/50 score (roc_auc): 0.593 (0.627 +- 0.034)
iteration: (3/5) 43/50 score (roc_auc): 0.624 (0.707 +- 0.082)
iteration: (4/5) 43/50 score (roc_auc): 0.570 (0.692 +- 0.122)
iteration: (5/5) 43/50 score (roc_auc): 0.625 (0.688 +- 0.063)
iteration: (1/5) 44/50 score (roc_auc): 0.585 (0.698 +- 0.113)
iteration: (2/5) 44/50 score (roc_auc): 0.603 (0.672 +- 0.068)
iteration: (3/5) 44/50 score (roc_auc): 0.627 (0.702 +- 0.076)
iteration: (4/5) 44/50 score (roc_auc): 0.584 (0.622 +- 0.037)
iteration: (5/5) 44/50 score (roc_auc): 0.626 (0.704 +- 0.078)
iteration: (1/5) 45/50 score (roc_auc): 0.585 (0.619 +- 0.034)
iteration: (2/5) 45/50 score (roc_auc): 0.537 (0.592 +- 0.055)
iteration: (3/5) 45/50 score (roc_auc): 0.537 (0.592 +- 0.055)
iteration: (4/5) 45/50 score (roc_auc): 0.626 (0.704 +- 0.078)
iteration: (5/5) 45/50 score (roc_auc): 0.582 (0.697 +- 0.116)
iteration: (1/5) 46/50 score (roc_auc): 0.630 (0.686 +- 0.055)
iteration: (2/5) 46/50 score (roc_auc): 0.627 (0.662 +- 0.035)
iteration: (3/5) 46/50 score (roc_auc): 0.534 (0.640 +- 0.107)
iteration: (4/5) 46/50 score (roc_auc): 0.627 (0.682 +- 0.056)
iteration: (5/5) 46/50 score (roc_auc): 0.543 (0.615 +- 0.073)
iteration: (1/5) 47/50 score (roc_auc): 0.585 (0.619 +- 0.034)
iteration: (2/5) 47/50 score (roc_auc): 0.625 (0.688 +- 0.063)
iteration: (3/5) 47/50 score (roc_auc): 0.584 (0.622 +- 0.037)
iteration: (4/5) 47/50 score (roc_auc): 0.593 (0.627 +- 0.034)
iteration: (5/5) 47/50 score (roc_auc): 0.625 (0.688 +- 0.063)
iteration: (1/5) 48/50 score (roc_auc): 0.625 (0.688 +- 0.063)
iteration: (2/5) 48/50 score (roc_auc): 0.531 (0.661 +- 0.129)
iteration: (3/5) 48/50 score (roc_auc): 0.625 (0.688 +- 0.063)
iteration: (4/5) 48/50 score (roc_auc): 0.608 (0.638 +- 0.030)
iteration: (5/5) 48/50 score (roc_auc): 0.556 (0.697 +- 0.141)
iteration: (1/5) 49/50 score (roc_auc): 0.634 (0.687 +- 0.053)
iteration: (2/5) 49/50 score (roc_auc): 0.502 (0.623 +- 0.121)
iteration: (3/5) 49/50 score (roc_auc): 0.627 (0.662 +- 0.035)
iteration: (4/5) 49/50 score (roc_auc): 0.557 (0.596 +- 0.039)
iteration: (5/5) 49/50 score (roc_auc): 0.554 (0.672 +- 0.119)
iteration: (1/5) 50/50 score (roc_auc): 0.556 (0.697 +- 0.141)
iteration: (2/5) 50/50 score (roc_auc): 0.627 (0.702 +- 0.076)
iteration: (3/5) 50/50 score (roc_auc): 0.626 (0.704 +- 0.078)
iteration: (4/5) 50/50 score (roc_auc): 0.626 (0.704 +- 0.078)
iteration: (5/5) 50/50 score (roc_auc): 0.622 (0.683 +- 0.061)
Saved current best model in /home/liconj/proj/thesis/EDeN/examples/model_comparison/data/AID720577.model
CPU times: user 1min 5s, sys: 15.6 s, total: 1min 21s
Wall time: 9min 50s

In [8]:
%%time
# Estimate predictive performance
# Evaluate the fitted model on the held-out test split; per the output
# below it reports precision/recall and returns (APR, ROC-AUC).
model.estimate( iterable_pos_test, iterable_neg_test )


Classifier:
SGDClassifier(alpha=0.001, average=False, class_weight='auto', epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.76752244980748252,
       learning_rate='optimal', loss='hinge', n_iter=67, n_jobs=1,
       penalty='l1', power_t=0.75168972356655483, random_state=None,
       shuffle=True, verbose=0, warm_start=False)

Data:
Instances: 46 ; Features: 1048577 with an avg of 598 features per instance

Predictive performace estimate:
             precision    recall  f1-score   support

         -1       0.71      0.57      0.63        30
          1       0.41      0.56      0.47        16

avg / total       0.60      0.57      0.58        46

APR: 0.493
ROC: 0.617
CPU times: user 219 ms, sys: 89.9 ms, total: 309 ms
Wall time: 1.05 s
Out[8]:
(0.49254906850739988, 0.61666666666666659)


3D model


In [13]:
# Configuration for the 3D variant: separate output file, '3d' encoding,
# and a larger hyperparameter-search budget (n_iter=200 vs 50 above).
model_fname = DATA_DIR + '/AID%s.model3d'%AID
model_type = "3d"
n_conf = 10             # number of conformers per molecule
n_iter = 200            # number of hyperparameter-optimization iterations
active_set_size = 5
n_active_learning_iterations = 0
train_test_split = 0.8  # fraction of data used for training


# Search space for the pre-processor (same structure as for the 2D model).
pre_processor_parameters={'k':randint(1, 10,size=n_iter),
                          'threshold':randint(3, 10, size=n_iter),
                          'model_type':[model_type],
                          'n_conf':[n_conf]}

def pre_processor(data, model_type="3d", **kwargs):
    """Convert raw molecule data into EDeN graph iterables.

    Parameters
    ----------
    data : iterable
        Molecule records (e.g. SDF blocks produced by make_iterable).
    model_type : str
        'default' for 2d graphs (obabel_to_eden) or '3d' for
        conformer-based graphs (obabel_to_eden3d).
    **kwargs
        Forwarded to the selected converter (e.g. k, threshold, n_conf).

    Returns
    -------
    Iterable of graphs suitable for the EDeN Vectorizer.

    Raises
    ------
    ValueError
        For an unrecognized model_type (the original fell through and
        raised a confusing UnboundLocalError on ``iterable``).
    """
    if model_type == "default":
        iterable = obabel.obabel_to_eden(data, **kwargs)
    elif model_type == "3d":
        iterable = obabel.obabel_to_eden3d(data, **kwargs)
    else:
        raise ValueError("Unknown model_type: %r" % model_type)
    return iterable



# Components for the 3d model: a fresh graph vectorizer and a linear SGD
# classifier with class weighting to compensate class imbalance.
vectorizer = Vectorizer()
estimator = SGDClassifier(class_weight='auto', shuffle=True)

# Make predictive model
# fit_vectorizer=True: the vectorizer is fitted on the positive training
# graphs before transformation (checked later via model3d.fit_vectorizer).
model3d = ActiveLearningBinaryClassificationModel(pre_processor,
                                                  estimator=estimator,
                                                  vectorizer=vectorizer,
                                                  n_jobs = 1,
                                                  pre_processor_n_jobs = 1,
                                                  n_blocks = 10,
                                                  fit_vectorizer=True)

In [14]:
########
# Create iterables from files
########

# Use the dataset paths built in the configuration cell instead of
# hard-coded relative filenames: the original read 'AID720577_active.sdf'
# from the CWD, silently ignoring DATA_DIR and the AID constant.
iterable_pos = make_iterable(active_fname, 'sdf')
iterable_neg = make_iterable(inactive_fname, 'sdf')
iterable_pos, iterable_pos_ = tee(iterable_pos)
iterable_neg, iterable_neg_ = tee(iterable_neg)

# Count instances on the duplicated iterators; take a single elapsed-time
# reading so the printed seconds and the timedelta agree (the original
# called time.time() three times).
start = time.time()
n_pos = sum(1 for x in iterable_pos_)
n_neg = sum(1 for x in iterable_neg_)
elapsed = time.time() - start
print('# positives: %d  # negatives: %d (%.1f sec %s)' % (n_pos, n_neg, elapsed, str(datetime.timedelta(seconds=elapsed))))


iterable_pos, iterable_pos_ = tee(iterable_pos)
iterable_neg, iterable_neg_ = tee(iterable_neg)

# Split train/test
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)


# positives: 80  # negatives: 146 (0.0 sec 0:00:00.037743)

In [15]:
# Sanity check: bare expression so the notebook displays the flag set at
# construction time (expected: True, see Out[15]).
model3d.fit_vectorizer


Out[15]:
True

In [16]:
#%%time
# Optimize hyperparameters and fit model

# Vectorizer search space: fixed complexity, vary the distance parameter n.
vectorizer_parameters={'complexity':[6], 'n':[2,3,4]}


# Estimator search space: one random draw per optimization iteration for
# the continuous/integer parameters, discrete grids for the rest.
estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      'l1_ratio':uniform(0.1,0.9, size=n_iter),
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      'power_t':uniform(0.1, size=n_iter),
                      'alpha': [10**x for x in range(-8,-2)],
                      'eta0': [10**x for x in range(-4,-1)],
                      'learning_rate': ["invscaling", "constant", "optimal"]}

# NOTE(review): the recorded run of this cell failed inside the vectorizer
# with KeyError: 'hlabel' (see traceback below) — presumably the 3d
# pre-processor produced graphs missing node labels; confirm upstream.
model3d.optimize(iterable_pos_train, iterable_neg_train,
               model_name=model_fname,
               n_iter=n_iter, cv=3,
               pre_processor_parameters=pre_processor_parameters,
               vectorizer_parameters=vectorizer_parameters,
               estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
         k: [1 8 9 9 1 7 5 1 1 7 5 3 1 9 1 1 4 7 3 7 4 5 9 5 4 9 7 3 5 7 4 7 4 7 9 6 4
 3 3 9 1 4 9 1 8 2 3 1 2 8 8 1 2 8 4 9 6 2 8 1 4 2 3 3 6 2 4 4 4 7 9 2 4 6
 5 8 5 2 9 9 5 9 1 1 9 8 9 1 6 1 9 5 1 4 3 7 1 9 7 2 2 4 7 8 5 9 6 3 7 4 3
 9 4 7 6 8 6 4 6 6 2 9 5 2 4 3 7 2 3 4 9 4 9 1 1 6 2 4 5 3 4 3 9 2 9 4 3 6
 6 7 8 1 9 5 5 2 1 4 4 4 9 6 3 1 9 2 2 4 7 1 1 2 2 4 2 2 1 2 7 7 7 4 4 7 2
 1 5 8 9 4 8 4 5 9 1 5 8 1 4 7]
model_type: ['3d']
    n_conf: [10]
 threshold: [4 6 3 8 4 5 7 8 7 4 3 6 7 4 5 5 3 5 8 3 8 3 3 7 3 7 3 5 7 9 5 4 3 7 7 8 7
 9 7 3 4 3 7 7 8 6 3 6 5 5 7 3 5 7 8 5 3 9 3 8 3 6 5 9 6 8 9 6 7 9 9 4 3 3
 8 7 7 8 4 8 5 8 5 4 6 7 7 9 3 8 4 4 3 7 8 5 6 3 6 4 6 9 6 9 7 7 4 6 9 8 9
 6 7 3 8 4 9 9 6 5 7 6 3 3 4 9 6 6 6 3 4 4 5 7 9 9 7 7 9 9 9 7 6 4 6 4 5 6
 5 7 8 5 9 4 8 9 8 6 3 9 7 4 9 3 7 4 3 9 6 4 6 5 4 5 4 7 5 5 5 3 3 7 6 5 4
 3 8 7 7 3 8 5 4 7 8 7 4 8 3 9]

Vectorizer:
complexity: [6]
         n: [2, 3, 4]

Estimator:
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.11273152  0.11991159  0.66697868  0.20502886  0.86726341  0.62745349
  0.52572065  0.37076132  0.59735328  0.4283104   0.5454762   0.18595832
  0.15926334  0.41280028  0.60606775  0.77928876  0.62246     0.61217913
  0.1699089   0.42016263  0.12713924  0.56426765  0.58821908  0.11736766
  0.72398518  0.13299696  0.13177103  0.53815239  0.67960735  0.61160101
  0.80325831  0.78472932  0.38472988  0.80592696  0.5946277   0.72988331
  0.34318314  0.409473    0.39703232  0.60239164  0.78087624  0.24956349
  0.32621954  0.89350332  0.40914926  0.27743467  0.34142982  0.23820054
  0.36182605  0.36092419  0.62926191  0.20323529  0.2979568   0.82564579
  0.56607559  0.84565936  0.21415717  0.45521579  0.19815394  0.31796694
  0.19831277  0.41419149  0.69886732  0.8417512   0.89228554  0.49850961
  0.80855962  0.55471632  0.21907737  0.17101447  0.14489379  0.86323902
  0.27497987  0.57521519  0.44414267  0.77006449  0.68987287  0.23731459
  0.62330054  0.17548121  0.73304703  0.42284607  0.531953    0.10740395
  0.5999974   0.66978431  0.14471962  0.60916267  0.54358031  0.25284803
  0.32969746  0.5823043   0.51393324  0.26833123  0.45631765  0.472745
  0.79985133  0.52753049  0.63962277  0.19400915  0.89983058  0.80464137
  0.69146703  0.23849243  0.10581234  0.3206996   0.62156142  0.55669718
  0.12135826  0.59661838  0.36870586  0.50956433  0.30937827  0.25501844
  0.80236666  0.73805262  0.83056325  0.39195614  0.84643785  0.48303725
  0.57277919  0.23845437  0.62580929  0.27929134  0.68436732  0.11257183
  0.41851262  0.65136741  0.35159759  0.27707442  0.32479772  0.83551303
  0.17902532  0.84054244  0.44791437  0.56955024  0.68213513  0.22559133
  0.77658355  0.87896398  0.85376443  0.41609871  0.27265154  0.78141889
  0.38014584  0.45084508  0.59173134  0.34640734  0.59068043  0.62042629
  0.27416014  0.14943216  0.78021076  0.50168532  0.85755387  0.88023855
  0.78468158  0.42358815  0.77612255  0.62886011  0.70409135  0.15042933
  0.55493316  0.30748831  0.2441698   0.44522053  0.69588428  0.66084199
  0.80405341  0.81301717  0.65104815  0.77191324  0.10983834  0.7968046
  0.69788328  0.46232228  0.54484034  0.61507984  0.581288    0.52159422
  0.62694458  0.34112358  0.78188335  0.45288487  0.48447336  0.79321803
  0.16688171  0.15046325  0.11237454  0.14697881  0.76149262  0.75396645
  0.82008466  0.22500078  0.55401913  0.76350901  0.55865102  0.86020495
  0.27323113  0.42730125]
learning_rate: ['invscaling', 'constant', 'optimal']
      loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
    n_iter: [56 56 40 19 61 31 55 87 33 29  9  6 51 80 68 32 18 68 60 38 55 42 92 49 36
 83 93 54 19 75 42 74 10 70 42 31 81 91 64 96 94 90 62 55 49 34 26 89 83 25
 80 64 29 11 43 90 45 97 57 88 19 11 18 46 92 34  8 15 58 92 89 21 44 30 15
 91 58 93 38 13 85 32 62 70  9 94 27 18 36  6 67 97  5 79 59 89 60 43 88 46
 71 83 21 55 80 47 75 21 99 89 53 11 57 73 97  7 71 80 96 47  6 40 32 80 81
 77  7 22 64 42 30 67 35 45 26 34 94 33 50 65 36 40 64 18 55 22 91 11 15 93
 69 98 75 96 85 64 70 57 73 77 45 39 44 45  9 41 96 86 34 66 36 63 16 56 52
 59 23 54 37 86  5 98 92 59 13 70 40 16 29 82 34 71 16 35 25 65 63 70 30  5]
   penalty: ['l1', 'l2', 'elasticnet']
   power_t: [ 0.85418527  0.993655    0.99505142  0.61029671  0.42835524  0.17532073
  0.6694998   0.3847786   0.80635648  0.40975458  0.58934741  0.16493772
  0.81918585  0.55259176  0.44937733  0.13672956  0.74297006  0.18895944
  0.1818122   0.12150092  0.7029641   0.17546984  0.88638861  0.171902
  0.32529594  0.71805697  0.15174213  0.52850488  0.63163219  0.98565082
  0.68748861  0.71069243  0.34801332  0.61267266  0.47611332  0.80952297
  0.22754312  0.85915609  0.69052677  0.44208772  0.88596859  0.88019756
  0.54950826  0.84379443  0.58204099  0.94239544  0.6896667   0.40550871
  0.4369512   0.92336964  0.14574161  0.97474546  0.14240127  0.56999251
  0.75198173  0.7010669   0.56685349  0.84714918  0.92096485  0.15002241
  0.73026281  0.28063715  0.80528012  0.78556997  0.36680492  0.48862072
  0.88872353  0.29452876  0.68932465  0.2892844   0.97392197  0.28791305
  0.94544741  0.80458218  0.15556947  0.98062044  0.50364844  0.38612008
  0.93470994  0.43672463  0.22987001  0.37916079  0.74409601  0.53583374
  0.5248004   0.4234665   0.5094887   0.80338722  0.71401229  0.23737457
  0.43210171  0.83969802  0.98808153  0.92644269  0.54769707  0.21106835
  0.14897217  0.42011444  0.87004852  0.1283732   0.70076205  0.90757953
  0.15739462  0.95185588  0.65804261  0.59583005  0.72619241  0.79462319
  0.83608348  0.43943122  0.68476522  0.43014643  0.79701818  0.29275927
  0.70307602  0.17625805  0.95462568  0.94707102  0.64971502  0.35728831
  0.36232428  0.56517768  0.68999461  0.42060089  0.37115033  0.98353034
  0.50923818  0.52123961  0.35315749  0.87889753  0.31739188  0.39061622
  0.29408424  0.34767105  0.9332576   0.93849348  0.19275782  0.16946046
  0.18651028  0.70875137  0.43319243  0.32413699  0.77797714  0.84965949
  0.16718905  0.89324142  0.8877169   0.35388757  0.82333076  0.98065735
  0.93618659  0.47692648  0.31663962  0.67305045  0.74512228  0.29274806
  0.56787679  0.50406847  0.79570382  0.74392195  0.68377903  0.89263837
  0.99650645  0.48416214  0.10084435  0.52997074  0.44903938  0.66534101
  0.22748678  0.72894605  0.32346126  0.8875769   0.41006538  0.59722969
  0.45058326  0.52897366  0.59265287  0.35976127  0.47191951  0.75655425
  0.12226017  0.61416973  0.83098692  0.12849561  0.22798547  0.71814969
  0.35782037  0.37012134  0.16080997  0.27660787  0.33834476  0.22655488
  0.14742366  0.76708474  0.28634289  0.29696978  0.5618372   0.78815242
  0.1491837   0.32210004]
Program run failed on Monday, 20. July 2015 11:43AM
Sequence index out of range.
list index out of range
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-16-b5a99f0d869e> in <module>()
     22                pre_processor_parameters=pre_processor_parameters,
     23                vectorizer_parameters=vectorizer_parameters,
---> 24                estimator_parameters=estimator_parameters)

/home/liconj/proj/thesis/EDeN/eden/model.py in optimize(self, iterable_pos, iterable_neg, model_name, n_active_learning_iterations, size_positive, size_negative, lower_bound_threshold_positive, upper_bound_threshold_positive, lower_bound_threshold_negative, upper_bound_threshold_negative, n_iter, n_inner_iter_estimator, max_total_time, pre_processor_parameters, vectorizer_parameters, estimator_parameters, cv, scoring, score_func, two_steps_optimization)
    229                     if True:#try:
    230                         if n_active_learning_iterations == 0:  # if no active learning mode, just produce data matrix
--> 231                             X, y = self._data_matrices(iterable_pos_, iterable_neg_, fit_vectorizer=self.fit_vectorizer)
    232                         else:  # otherwise use the active learning strategy
    233                             X, y = self._active_learning_data_matrices(iterable_pos_, iterable_neg_,

/home/liconj/proj/thesis/EDeN/eden/model.py in _data_matrices(self, iterable_pos, iterable_neg, fit_vectorizer)
    336 
    337     def _data_matrices(self, iterable_pos, iterable_neg, fit_vectorizer=False):
--> 338         data_matrix_pos = self._data_matrix(iterable_pos, fit_vectorizer=fit_vectorizer)
    339         data_matrix_neg = self._data_matrix(iterable_neg, fit_vectorizer=False)
    340         return self._assemble_data_matrix(data_matrix_pos, data_matrix_neg)

/home/liconj/proj/thesis/EDeN/eden/model.py in _data_matrix(self, iterable, fit_vectorizer)
    332         if fit_vectorizer:
    333             self.vectorizer.fit(graphs_)
--> 334         data_matrix = vectorize(graphs, vectorizer=self.vectorizer, n_jobs=self.n_jobs, n_blocks=self.n_blocks)
    335         return data_matrix
    336 

/home/liconj/proj/thesis/EDeN/eden/util/__init__.pyc in vectorize(graphs, vectorizer, fit_flag, n_blocks, block_size, n_jobs)
    170         raise Exception("Cannot perform fit in parallel: set n_jobs to 1")
    171     if n_jobs == 1:
--> 172         return serial_vectorize(graphs, vectorizer=vectorizer, fit_flag=fit_flag)
    173     else:
    174         return multiprocess_vectorize(graphs, vectorizer=vectorizer, n_blocks=n_blocks, block_size=block_size, n_jobs=n_jobs)

/home/liconj/proj/thesis/EDeN/eden/util/__init__.pyc in serial_vectorize(graphs, vectorizer, fit_flag)
    145         data_matrix = vectorizer.fit_transform(graphs)
    146     else:
--> 147         data_matrix = vectorizer.transform(graphs)
    148     return data_matrix
    149 

/home/liconj/proj/thesis/EDeN/eden/graph.pyc in transform(self, graphs)
    191         for instance_id, G in enumerate(graphs):
    192             self._test_goodness(G)
--> 193             feature_dict.update(self._transform(instance_id, G))
    194         if instance_id is None:
    195             raise Exception('ERROR: something went wrong, no graphs are present in current iterator.')

/home/liconj/proj/thesis/EDeN/eden/graph.pyc in _transform(self, instance_id, original_graph)
    487 
    488     def _transform(self, instance_id, original_graph):
--> 489         graph = self._graph_preprocessing(original_graph)
    490         # collect all features for all vertices for each label_index
    491         feature_list = defaultdict(lambda: defaultdict(float))

/home/liconj/proj/thesis/EDeN/eden/graph.pyc in _graph_preprocessing(self, original_graph)
    481         self._label_preprocessing(graph)
    482         self._compute_distant_neighbours(graph, max(self.r, self.d))
--> 483         self._compute_neighborhood_graph_hash_cache(graph)
    484         if graph.graph.get('weighted', False):
    485             self._compute_neighborhood_graph_weight_cache(graph)

/home/liconj/proj/thesis/EDeN/eden/graph.pyc in _compute_neighborhood_graph_hash_cache(self, graph)
    573         for u, d in graph.nodes_iter(data=True):
    574             if d.get('node', False):
--> 575                 self._compute_neighborhood_graph_hash(u, graph)
    576 
    577     def _compute_neighborhood_graph_hash(self, root, graph):

/home/liconj/proj/thesis/EDeN/eden/graph.pyc in _compute_neighborhood_graph_hash(self, root, graph)
    587                 hash_label_list = []
    588                 for v in node_set:
--> 589                     vhlabel = graph.node[v]['hlabel'][label_index]
    590                     hash_label_list.append(vhlabel)
    591                 # sort it

KeyError: 'hlabel'

In [ ]:
%%time
# Estimate predictive performance
# NOTE(review): requires a successful model3d.optimize above; the recorded
# optimization run crashed (KeyError: 'hlabel'), so this cell has no output.
model3d.estimate( iterable_pos_test, iterable_neg_test )

In [ ]:
def test_obabel_model(fname, model_type = "default", model_fname=None):
    """Load a previously saved model and score the molecules in `fname`.

    Parameters
    ----------
    fname : str
        Path to the molecule file (sdf/smi) to score.
    model_type : str
        'default' for 2d graphs, '3d' for conformer-based graphs.
    model_fname : str
        Path of the serialized model produced by model.optimize.

    Returns
    -------
    Decision-function scores, one per input molecule.

    Raises
    ------
    ValueError
        For an unrecognized model_type (the original fell through and
        raised UnboundLocalError on ``iterable``).
    """
    # ActiveLearningBinaryClassificationModel and obabel are imported at
    # the top of the notebook; the original re-imported them locally.
    model = ActiveLearningBinaryClassificationModel()
    model.load(model_fname)

    # Create iterable from the input file with the matching converter.
    if model_type == "default":
        iterable = obabel.obabel_to_eden(fname)
    elif model_type == "3d":
        iterable = obabel.obabel_to_eden3d(fname)
    else:
        raise ValueError("Unknown model_type: %r" % model_type)

    predictions = model.decision_function(iterable)

    return predictions