In [1]:
# Reload imported modules automatically so live edits to EDeN are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
from eden.converter.molecule import obabel
import logging
# Route library log messages to a file instead of cluttering the notebook output.
logging.basicConfig(filename="example.log")

In [3]:
import requests

def get_compounds(fname, size, listkey):
    PROLOG='https://pubchem.ncbi.nlm.nih.gov/rest/pug/'
    with open(fname,'w') as file_handle:
        stepsize=50
        index_start=0
        for chunk, index_end in enumerate(range(0,size+stepsize,stepsize)):
            if index_end is not 0 :
                print 'Chunk %s) Processing compounds %s to %s (of a total of %s)' % (chunk, index_start, index_end-1, size)
                RESTQ = PROLOG + 'compound/listkey/' + str(listkey) + '/SDF?&listkey_start=' + str(index_start) + '&listkey_count=' + str(stepsize)
                reply=requests.get(RESTQ)
                file_handle.write(reply.text)
            index_start = index_end
        print 'compounds available in file: ', fname


def get_assay(assay_id):
    """Download actives and inactives of a PubChem bioassay as SDF files.

    Parameters
    ----------
    assay_id : int or str
        PubChem AID of the bioassay.

    Returns
    -------
    (str, str)
        Paths of the written SDF files:
        'data/AID<id>_active.sdf' and 'data/AID<id>_inactive.sdf'.
    """
    PROLOG='https://pubchem.ncbi.nlm.nih.gov/rest/pug/'
    AID=str(assay_id)

    def _download(cids_type):
        # Ask PubChem for the cids of this activity class, keeping the id
        # list server-side behind a listkey, then stream the SDF records.
        RESTQ = PROLOG + 'assay/aid/' + AID + '/cids/JSON?cids_type=' + cids_type + '&list_return=listkey'
        reply = requests.get(RESTQ)
        # Parse the JSON body once instead of once per field (the original
        # called reply.json() twice per request).
        id_list = reply.json()['IdentifierList']
        fname = 'data/AID' + AID + '_' + cids_type + '.sdf'
        get_compounds(fname=fname, size=id_list['Size'], listkey=id_list['ListKey'])
        return fname

    active_fname = _download('active')
    inactive_fname = _download('inactive')
    return (active_fname, inactive_fname)

In [4]:
import datetime, time
def train_obabel_model(pos_fname, neg_fname, model_fname=None, n_iter=40, active_set_size=1000, n_active_learning_iterations=3, threshold=1, train_test_split=0.7, verbose=False):
    """Fit and evaluate an EDeN active-learning classifier on two SDF files.

    Parameters
    ----------
    pos_fname, neg_fname : str
        SDF files with the positive (active) and negative (inactive) molecules.
    model_fname : str, optional
        Name under which model.optimize() stores the best model.
    n_iter : int
        Number of hyper-parameter sampling iterations passed to optimize().
    active_set_size : int
        Passed as size_negative to optimize() (negative sample budget).
    n_active_learning_iterations : int
        Number of active-learning rounds inside optimize().
    threshold : int
        NOTE(review): not referenced anywhere in this body -- confirm intent.
    train_test_split : float
        Fraction of each class used for training; the remainder is held out.
    verbose : bool
        NOTE(review): not referenced anywhere in this body -- confirm intent.

    Returns
    -------
    The optimized ActiveLearningBinaryClassificationModel, after estimate()
    reported held-out performance.
    """
    
    
    # Identity pre-processor: the obabel converter already yields EDeN graphs.
    def pre_processor( data, **args):
        return data
    
    from eden.graph import Vectorizer
    vectorizer = Vectorizer()

    from sklearn.linear_model import SGDClassifier
    # NOTE(review): class_weight='auto' is the pre-0.17 sklearn spelling of
    # 'balanced' -- confirm it is still accepted by the installed version.
    estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True)

    #create iterable from files
    from eden.converter.molecule import obabel
    iterable_pos=obabel.obabel_to_eden(pos_fname)
    iterable_neg=obabel.obabel_to_eden(neg_fname)
    
    # tee() each generator: the *_ copy is consumed below purely for counting,
    # leaving the primary copy intact for the train/test split.
    from itertools import tee
    iterable_pos, iterable_pos_ = tee(iterable_pos)
    iterable_neg, iterable_neg_ = tee(iterable_neg)
    
    import time
    start = time.time()
    print('# positives: %d  # negatives: %d (%.1f sec %s)'%(sum(1 for x in iterable_pos_), sum(1 for x in iterable_neg_), time.time() - start, str(datetime.timedelta(seconds=(time.time() - start)))))
    
    #split train/test
    from eden.util import random_bipartition_iter
    iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
    iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)



    #make predictive model
    from eden.model import ActiveLearningBinaryClassificationModel
    # model = ActiveLearningBinaryClassificationModel( pre_processor, estimator=estimator, vectorizer=vectorizer )
    model = ActiveLearningBinaryClassificationModel(pre_processor,
                                                estimator=estimator,
                                                vectorizer=vectorizer,
                                                n_jobs = 2,
                                                n_blocks = 10,
                                                fit_vectorizer=True)
 
    from numpy.random import randint
    from numpy.random import uniform

    pre_processor_parameters={'model_type':'default'} 
    
    # The training time for this model is much smaller, so we can use various iterations of the
    # vectorizer
    vectorizer_parameters={'complexity':[2,3,4,5,6]}

    # Hyper-parameter search space sampled by optimize(); each array entry
    # is one candidate value per sampling iteration.
    estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                          'penalty':['l1','l2','elasticnet'],
                          'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                          'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                          'power_t':uniform(0.1, size=n_iter),
                          'alpha': [10**x for x in range(-8,-2)],
                          'eta0': [10**x for x in range(-4,-1)],
                          'learning_rate': ["invscaling", "constant", "optimal"]}

    model.optimize(iterable_pos_train, iterable_neg_train, 
                   model_name=model_fname,
                   n_active_learning_iterations=n_active_learning_iterations,
                   size_positive=-1,
                   size_negative=active_set_size,
                   n_iter=n_iter, cv=3,
                   pre_processor_parameters=pre_processor_parameters, 
                   vectorizer_parameters=vectorizer_parameters, 
                   estimator_parameters=estimator_parameters)
    
    #estimate predictive performance
    model.estimate( iterable_pos_test, iterable_neg_test)
    return model
    
    
def test_obabel_model(fname, model_fname=None):
    """Score the molecules in an SDF file with a previously saved model.

    Parameters
    ----------
    fname : str
        SDF file with the molecules to score.
    model_fname : str, optional
        Path of the model persisted by a prior optimize() run.

    Returns
    -------
    Decision-function margins, one per molecule graph.
    """
    from eden.converter.molecule import obabel
    from eden.model import ActiveLearningBinaryClassificationModel

    # Restore the persisted estimator.
    trained_model = ActiveLearningBinaryClassificationModel()
    trained_model.load(model_fname)

    # Stream the molecules as EDeN graphs and compute their margins.
    graphs = obabel.obabel_to_eden(fname)
    return trained_model.decision_function(graphs)

In [5]:
# PubChem bioassay id to process in the cells below.
AID=720577
#AID=2801

In [7]:
%%time

# Toggle: reuse SDF files downloaded by an earlier run of get_assay()
# instead of re-querying PubChem (the download is slow).
#READ_FROM_FILE=False
READ_FROM_FILE=True

if READ_FROM_FILE:
    # Paths must match the naming scheme used by get_assay().
    active_fname='data/AID%s_active.sdf'%AID
    inactive_fname='data/AID%s_inactive.sdf'%AID
else:
    active_fname, inactive_fname = get_assay(AID)


CPU times: user 7 µs, sys: 2 µs, total: 9 µs
Wall time: 13.8 µs

In [8]:
%%time

# Train on the downloaded assay data; optimize() persists the best model
# under models/. active_set_size=0 and n_active_learning_iterations=0
# effectively disable the active-learning rounds for this run.
model_fname='models/AID%s.default_model'%AID
fitted_model = train_obabel_model(active_fname, inactive_fname, model_fname=model_fname, 
                           n_iter=20, 
                           active_set_size=0, 
                           n_active_learning_iterations=0, 
                           threshold=1, 
                           train_test_split=0.7, 
                           verbose=1)


# positives: 0  # negatives: 146 (0.5 sec 0:00:00.457108)
WARNING:eden.model:ERROR: no iteration has produced any viable solution. The model produced is unusable.
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-8-0be4c174f29e> in <module>()
----> 1 get_ipython().run_cell_magic(u'time', u'', u"\nmodel_fname='models/AID%s.default_model'%AID\nfitted_model = train_obabel_model(active_fname, inactive_fname, model_fname=model_fname, \n                           n_iter=20, \n                           active_set_size=0, \n                           n_active_learning_iterations=0, \n                           threshold=1, \n                           train_test_split=0.7, \n                           verbose=1)")

/home/liconj/.local/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_cell_magic(self, magic_name, line, cell)
   2262             magic_arg_s = self.var_expand(line, stack_depth)
   2263             with self.builtin_trap:
-> 2264                 result = fn(magic_arg_s, cell)
   2265             return result
   2266 

/home/liconj/.local/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)

/home/liconj/.local/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/home/liconj/.local/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1164         else:
   1165             st = clock2()
-> 1166             exec(code, glob, local_ns)
   1167             end = clock2()
   1168             out = None

<timed exec> in <module>()

<ipython-input-4-eea6e98cf83d> in train_obabel_model(pos_fname, neg_fname, model_fname, n_iter, active_set_size, n_active_learning_iterations, threshold, train_test_split, verbose)
     71 
     72     #estimate predictive performance
---> 73     model.estimate( iterable_pos_test, iterable_neg_test)
     74     return model
     75 

/home/liconj/proj/thesis/EDeN/eden/model.pyc in estimate(self, iterable_pos, iterable_neg)
    101 
    102     def estimate(self, iterable_pos, iterable_neg):
--> 103         data_matrix, y = self._data_matrices(iterable_pos, iterable_neg, fit_vectorizer=False)
    104         margins = self.estimator.decision_function(data_matrix)
    105         predictions = self.estimator.predict(data_matrix)

/home/liconj/proj/thesis/EDeN/eden/model.pyc in _data_matrices(self, iterable_pos, iterable_neg, fit_vectorizer)
    336 
    337     def _data_matrices(self, iterable_pos, iterable_neg, fit_vectorizer=False):
--> 338         data_matrix_pos = self._data_matrix(iterable_pos, fit_vectorizer=fit_vectorizer)
    339         data_matrix_neg = self._data_matrix(iterable_neg, fit_vectorizer=False)
    340         return self._assemble_data_matrix(data_matrix_pos, data_matrix_neg)

/home/liconj/proj/thesis/EDeN/eden/model.pyc in _data_matrix(self, iterable, fit_vectorizer)
    327                                 n_blocks=self.pre_processor_n_blocks,
    328                                 block_size=self.pre_processor_block_size,
--> 329                                 n_jobs=self.pre_processor_n_jobs)
    330         graphs, graphs_ = tee(graphs)
    331         self.vectorizer.set_params(**self.vectorizer_args)

/home/liconj/proj/thesis/EDeN/eden/util/__init__.pyc in mp_pre_process(iterable, pre_processor, pre_processor_args, n_blocks, block_size, n_jobs)
    138         return pre_processor(iterable, **pre_processor_args)
    139     else:
--> 140         return multiprocess_pre_process(iterable, pre_processor=pre_processor, pre_processor_args=pre_processor_args, n_blocks=n_blocks, block_size=block_size, n_jobs=n_jobs)
    141 
    142 

/home/liconj/proj/thesis/EDeN/eden/util/__init__.pyc in multiprocess_pre_process(iterable, pre_processor, pre_processor_args, n_blocks, block_size, n_jobs)
    124         pool = mp.Pool(n_jobs)
    125     results = [apply_async(pool, serial_pre_process, args=(iterable[start:end], pre_processor, pre_processor_args)) for start, end in intervals]
--> 126     output = [p.get() for p in results]
    127     pool.close()
    128     pool.join()

/usr/lib64/python2.7/multiprocessing/pool.pyc in get(self, timeout)
    552             return self._value
    553         else:
--> 554             raise self._value
    555 
    556     def _set(self, i, obj):

TypeError: 'NoneType' object is not callable

In [ ]:
# Show the hyper-parameters selected by optimize() for the fitted model.
print fitted_model.get_parameters()

In [ ]:
# Visual sanity check: render the first few active molecules as EDeN graphs.
from itertools import islice

from eden.converter.molecule import obabel
from eden.util.display import draw_graph

molecule_graphs = obabel.obabel_to_eden(active_fname, file_type='sdf')
for molecule_graph in islice(molecule_graphs, 3):
    draw_graph(molecule_graph, size=12, node_size=400, node_border=1, vertex_label='hlabel')