In [8]:
%matplotlib inline
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [9]:
from eden.converter.molecule import obabel
import networkx as nx
import pybel
import requests
import os.path
Get the data from PubChem server:

In [10]:
%%time
# Assay identifier (AID) used in the data file names; 2401 is the other
# data set used further down in this notebook.
AID = 1

# Directory holding the AID<id>_active.sdf / AID<id>_inactive.sdf files.
# The historical location stays the default; set the EDEN_DATA_DIR environment
# variable to point elsewhere (e.g. '/Volumes/seagate/thesis/examples/data')
# instead of editing this cell on every machine.
DATA_DIR = os.environ.get('EDEN_DATA_DIR',
                          '/Users/jl/uni-freiburg/thesis/EDeN/examples/3Dmodel/data')

active_fname = os.path.join(DATA_DIR, 'AID%s_active.sdf' % AID)
inactive_fname = os.path.join(DATA_DIR, 'AID%s_inactive.sdf' % AID)


CPU times: user 10 µs, sys: 2 µs, total: 12 µs
Wall time: 17.9 µs

In [11]:
# Display the resolved path of the actives file as a quick sanity check.
active_fname


Out[11]:
'/Users/jl/uni-freiburg/thesis/EDeN/examples/3Dmodel/data/AID1_active.sdf'

In [12]:
def make_iterable(filename, file_format):
    """Lazily yield one molecule record at a time from a molecule file.

    Parameters
    ----------
    filename : str
        Path of the file to read. This is a generator, so the file is only
        opened when the first item is requested; a missing file therefore
        raises at iteration time, not at call time.
    file_format : str
        'sdf' - records are terminated by a line containing only '$$$$'
        (surrounding whitespace ignored); each yielded string holds all
        lines of one record, terminator line included.
        'smi' - every line of the file is yielded unchanged.

    Yields
    ------
    str
        The text of one record with its original line endings.

    Raises
    ------
    ValueError
        If file_format is neither 'sdf' nor 'smi' (raised on first iteration).
    IOError
        If the file cannot be opened.

    Notes
    -----
    Trailing lines of an 'sdf' file that are not followed by a '$$$$'
    terminator are not yielded.
    """
    if file_format == 'sdf':
        with open(filename) as handle:
            # Collect lines in a list and join once per record instead of
            # growing a string line by line.
            record_lines = []
            for line in handle:
                record_lines.append(line)
                if line.strip() == '$$$$':
                    yield ''.join(record_lines)
                    record_lines = []
    elif file_format == 'smi':
        with open(filename) as handle:
            for line in handle:
                yield line
    else:
        # Previously an unknown format silently produced an empty stream.
        raise ValueError("unsupported file_format %r; expected 'sdf' or 'smi'"
                         % (file_format,))

Functions for training and testing the model


In [13]:
import datetime, time
def train_obabel_model(iterable_pos, iterable_neg, pre_processor_parameters, data_dir,
                       model_type = "default",
                       model_fname=None, n_iter=40, active_set_size=1000,
                       n_active_learning_iterations=3, threshold=1, train_test_split=0.7,
                       verbose=False):
    """Optimize, fit and evaluate an EDeN active-learning binary classifier.

    The positive and negative example streams are counted, split into train
    and test parts, used to optimize an ActiveLearningBinaryClassificationModel
    (SGDClassifier estimator, EDeN Vectorizer), and the resulting model is
    evaluated on the test parts.

    Parameters
    ----------
    iterable_pos, iterable_neg : iterable
        Streams of positive / negative examples, handed to the obabel
        converter by the pre-processor. Both are duplicated with
        itertools.tee and one copy is fully consumed for counting, so all
        items end up buffered in memory.
    pre_processor_parameters : dict
        Candidate values passed to model.optimize as pre_processor_parameters;
        presumably sampled and forwarded to the pre-processor as keyword
        arguments (TODO confirm against eden.model). A 'model_type' entry
        here overrides the model_type argument below.
    data_dir : str
        Accepted for interface compatibility; not used by this function.
    model_type : str
        'default' converts with obabel.obabel_to_eden, '3d' with
        obabel.obabel_to_eden3d. Used as the pre-processor's default
        conversion mode.
    model_fname : str or None
        Passed to model.optimize as model_name.
    n_iter : int
        Number of sampled values per random hyper-parameter, also passed
        to model.optimize as n_iter.
    active_set_size : int
        Passed to model.optimize as size_negative.
    n_active_learning_iterations : int
        Passed to model.optimize unchanged.
    threshold : int
        Accepted for interface compatibility; not used by this function.
    train_test_split : float
        relative_size given to random_bipartition_iter for both classes.
    verbose : bool or int
        Passed to model.optimize unchanged.

    Returns
    -------
    ActiveLearningBinaryClassificationModel
        The model after optimize() and estimate() have been called.

    Raises
    ------
    ValueError
        From the pre-processor, if it is invoked with a model_type other
        than 'default' or '3d'.
    """
    from numpy.random import randint
    from numpy.random import uniform

    # Shared by every pre-processor call in the '3d' mode.
    global_cache = {}

    # This will be passed as an argument to the model later on. The default
    # is bound to the enclosing model_type argument; previously it was
    # hard-coded to "3d", so the caller's choice was silently ignored.
    def pre_processor(data, model_type=model_type, **kwargs):
        if model_type == "default":
            return obabel.obabel_to_eden(data, **kwargs)
        if model_type == "3d":
            return obabel.obabel_to_eden3d(data, cache=global_cache, **kwargs)
        # Previously fell through to an unbound local variable.
        raise ValueError("unknown model_type %r; expected 'default' or '3d'"
                         % (model_type,))

    from eden.graph import Vectorizer
    vectorizer = Vectorizer()

    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(class_weight='auto', shuffle=True)

    # Count the examples on a duplicate of each stream; tee() buffers what
    # the counting copy consumes so the primary copy can still be read.
    from itertools import tee
    iterable_pos, count_pos = tee(iterable_pos)
    iterable_neg, count_neg = tee(iterable_neg)

    start = time.time()
    n_pos = sum(1 for _ in count_pos)
    n_neg = sum(1 for _ in count_neg)
    elapsed = time.time() - start
    print('# positives: %d  # negatives: %d (%.1f sec %s)' % (
        n_pos, n_neg, elapsed, str(datetime.timedelta(seconds=elapsed))))

    # Split train/test.
    from eden.util import random_bipartition_iter
    iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
    iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)

    # Make predictive model.
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel(pre_processor,
                                                    estimator=estimator,
                                                    vectorizer=vectorizer,
                                                    n_jobs=2,
                                                    n_blocks = 10,
                                                    fit_vectorizer=True)

    # Candidate hyper-parameter values for the optimizer.
    # NOTE(review): randint's upper bound is exclusive, so randint(2, 3, ...)
    # always yields 2 -- confirm that a fixed value is intended here.
    vectorizer_parameters={'complexity':[2,3,4],
                           'discretization_size':randint(2, 3,size=n_iter),
                           'discretization_dimension':randint(2, 3,size=n_iter)}

    estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                          'penalty':['l1','l2','elasticnet'],
                          'l1_ratio':uniform(0.1,0.9, size=n_iter),
                          'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                          'power_t':uniform(0.1, size=n_iter),
                          'alpha': [10**x for x in range(-8,-2)],
                          'eta0': [10**x for x in range(-4,-1)],
                          'learning_rate': ["invscaling", "constant", "optimal"]}

    # Optimize hyperparameters and fit model.
    print("calling optimizer..")
    model.optimize(iterable_pos_train, iterable_neg_train,
                   model_name=model_fname,
                   n_active_learning_iterations=n_active_learning_iterations,
                   size_positive=-1,
                   size_negative=active_set_size,
                   n_iter=n_iter, cv=3, verbose=verbose,
                   pre_processor_parameters=pre_processor_parameters,
                   vectorizer_parameters=vectorizer_parameters,
                   estimator_parameters=estimator_parameters)

    # Estimate predictive performance on the held-out parts.
    # estimate() has no cv parameter, so it is called without one.
    model.estimate( iterable_pos_test, iterable_neg_test )

    return model

def test_obabel_model(fname, model_type = "default", model_fname=None):
    from eden.model import ActiveLearningBinaryClassificationModel

    model = ActiveLearningBinaryClassificationModel()
    model.load(model_fname)

    #create iterable from files
    from eden.converter.molecule import obabel
    if model_type == "default":
        iterable=obabel.obabel_to_eden(fname)
    elif model_type == "3d":
        iterable=obabel.obabel_to_eden3d(fname)

    predictions= model.decision_function( iterable )

    return predictions

Train the models

3D model - no extra conformers


In [14]:
%%time
from numpy.random import randint
from numpy.random import uniform

# Stream the active / inactive molecules record by record from the SDF files.
pos_iterator = make_iterable(active_fname, 'sdf')
neg_iterator = make_iterable(inactive_fname, 'sdf')

# NOTE(review): the "with conformers" cell below uses this same file name,
# so it presumably overwrites the model written here -- confirm.
model_fname = DATA_DIR + '/AID%s.model3d' % AID

# Number of sampled values per random parameter; the same variable is handed
# to the trainer so the two cannot drift apart.
n_iter = 5
pre_processor_parameters = {'k': randint(1, 10, size=n_iter),
                            'threshold': randint(1, 10, size=n_iter),
                            'model_type': ['3d'],
                            'n_conf': [0]}  # no extra conformers

model = train_obabel_model(pos_iterator, neg_iterator, pre_processor_parameters,
                           data_dir=DATA_DIR,
                           model_type="3d",
                           model_fname=model_fname,
                           n_iter=n_iter,
                           active_set_size=5,
                           n_active_learning_iterations=0,
                           threshold=1,
                           train_test_split=0.8,
                           verbose=1)


---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-14-0123fe202ffc> in <module>()
----> 1 get_ipython().run_cell_magic(u'time', u'', u'from numpy.random import randint\nfrom numpy.random import uniform\n\npos_iterator=make_iterable(active_fname, \'sdf\')\nneg_iterator=make_iterable(inactive_fname, \'sdf\')\n\nmodel_fname=DATA_DIR + \'/AID%s.model3d\'%AID\n\nn_iter = 5\npre_processor_parameters={\'k\':randint(1, 10,size=n_iter),\n                          \'threshold\':randint(1, 10, size=n_iter),\n                          \'model_type\':[\'3d\'],\n                          \'n_conf\':[0]}\n\nmodel = train_obabel_model(pos_iterator, neg_iterator, pre_processor_parameters,\n                           data_dir=DATA_DIR,\n                           model_type = "3d",\n                           model_fname=model_fname,\n                           n_iter=5,\n                           active_set_size=5,\n                           n_active_learning_iterations=0,\n                           threshold=1,\n                           train_test_split=0.8,\n                           verbose=1)')

/home/liconj/.local/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_cell_magic(self, magic_name, line, cell)
   2262             magic_arg_s = self.var_expand(line, stack_depth)
   2263             with self.builtin_trap:
-> 2264                 result = fn(magic_arg_s, cell)
   2265             return result
   2266 

/home/liconj/.local/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)

/home/liconj/.local/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/home/liconj/.local/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1164         else:
   1165             st = clock2()
-> 1166             exec(code, glob, local_ns)
   1167             end = clock2()
   1168             out = None

<timed exec> in <module>()

<ipython-input-13-18296af7e14f> in train_obabel_model(iterable_pos, iterable_neg, pre_processor_parameters, data_dir, model_type, model_fname, n_iter, active_set_size, n_active_learning_iterations, threshold, train_test_split, verbose)
     39     import time
     40     start = time.time()
---> 41     print('# positives: %d  # negatives: %d (%.1f sec %s)'%(sum(1 for x in iterable_pos_), sum(1 for x in iterable_neg_), time.time() - start, str(datetime.timedelta(seconds=(time.time() - start)))))
     42 
     43     iterable_pos, iterable_pos_ = tee(iterable_pos)

<ipython-input-13-18296af7e14f> in <genexpr>(***failed resolving arguments***)
     39     import time
     40     start = time.time()
---> 41     print('# positives: %d  # negatives: %d (%.1f sec %s)'%(sum(1 for x in iterable_pos_), sum(1 for x in iterable_neg_), time.time() - start, str(datetime.timedelta(seconds=(time.time() - start)))))
     42 
     43     iterable_pos, iterable_pos_ = tee(iterable_pos)

<ipython-input-12-a9eb51c29e7d> in make_iterable(filename, file_format)
      1 def make_iterable(filename, file_format):
      2     if file_format == 'sdf':
----> 3         with open(filename) as f:
      4             s = ''
      5             for line in f:

IOError: [Errno 2] No such file or directory: '/Users/jl/uni-freiburg/thesis/EDeN/examples/3Dmodel/data/AID1_active.sdf'

3D model - with conformers


In [ ]:
%%time
from numpy.random import randint
from numpy.random import uniform

# Fresh generators are needed for every run: the previous ones are exhausted.
pos_iterator = make_iterable(active_fname, 'sdf')
neg_iterator = make_iterable(inactive_fname, 'sdf')

# NOTE(review): same file name as the "no extra conformers" cell above, so
# that model is presumably overwritten here -- confirm.
model_fname = DATA_DIR + '/AID%s.model3d' % AID

# Number of sampled values per random parameter; the same variable is handed
# to the trainer so the two cannot drift apart.
n_iter = 5
pre_processor_parameters = {'k': randint(1, 10, size=n_iter),
                            'threshold': randint(1, 10, size=n_iter),
                            'model_type': ['3d'],
                            'n_conf': [10]}  # with conformers

model = train_obabel_model(pos_iterator, neg_iterator, pre_processor_parameters,
                           data_dir=DATA_DIR,
                           model_type="3d",
                           model_fname=model_fname,
                           n_iter=n_iter,
                           active_set_size=5,
                           n_active_learning_iterations=0,
                           threshold=1,
                           train_test_split=0.8,
                           verbose=1)

In [ ]:
# Switch the notebook to assay 2401 and rebuild both SDF paths for it.
AID = 2401
active_fname = ''.join([DATA_DIR, '/AID%s_active.sdf' % AID])
inactive_fname = ''.join([DATA_DIR, '/AID%s_inactive.sdf' % AID])

In [ ]:
from numpy.random import randint
from numpy.random import uniform

# Stream the molecules of the currently selected assay from its SDF files.
pos_iterator = make_iterable(active_fname, 'sdf')
neg_iterator = make_iterable(inactive_fname, 'sdf')

model_fname = DATA_DIR + '/AID%s.model3d' % AID

# Number of sampled values per random parameter; the same variable is handed
# to the trainer so the two cannot drift apart.
n_iter = 5
pre_processor_parameters = {'k': randint(1, 10, size=n_iter),
                            'threshold': randint(1, 10, size=n_iter),
                            'model_type': ['3d'],
                            'n_conf': [10]}

model = train_obabel_model(pos_iterator, neg_iterator, pre_processor_parameters,
                           data_dir=DATA_DIR,
                           model_type="3d",
                           model_fname=model_fname,
                           n_iter=n_iter,
                           active_set_size=5,
                           n_active_learning_iterations=0,
                           threshold=1,
                           train_test_split=0.8,
                           verbose=2)

Test the models:


In [ ]:
# NOTE(review): unfinished scratch cell. active_X, inactive_X and a top-level
# `vectorizer` are not defined in any cell visible in this notebook, so on a
# fresh kernel this cell is expected to raise NameError.
# Plan: create active_X and inactive_X (data matrices) with vectorize(...), as above.
active_X
inactive_X
from eden.util import fit
fit(active_X, inactive_X, vectorizer)
# TODO: do transform on both sets, but fit only on the positive set.
# (meeting note: next meeting Wednesday 16:30)