In [1]:
import requests

def get_compounds(fname, size, listkey):
    PROLOG='https://pubchem.ncbi.nlm.nih.gov/rest/pug/'
    with open(fname,'w') as file_handle:
        stepsize=50
        index_start=0
        for chunk, index_end in enumerate(range(0,size+stepsize,stepsize)):
            if index_end != 0:
                print('Chunk %s) Processing compounds %s to %s (of a total of %s)' % (chunk, index_start, index_end - 1, size))
                RESTQ = PROLOG + 'compound/listkey/' + str(listkey) + '/SDF?listkey_start=' + str(index_start) + '&listkey_count=' + str(stepsize)
                reply=requests.get(RESTQ)
                file_handle.write(reply.text)
            index_start = index_end
        print('compounds available in file: %s' % fname)


def get_assay(assay_id):
    PROLOG='https://pubchem.ncbi.nlm.nih.gov/rest/pug/'
    AID=str(assay_id)
    #active
    RESTQ = PROLOG + 'assay/aid/' + AID + '/cids/JSON?cids_type=active&list_return=listkey'
    reply=requests.get(RESTQ)
    #extract the listkey
    active_listkey = reply.json()['IdentifierList']['ListKey']
    active_size = reply.json()['IdentifierList']['Size'] 
    active_fname = 'AID'+AID+'_active.sdf'
    get_compounds(fname=active_fname, size=active_size, listkey=active_listkey)

    #inactive
    RESTQ = PROLOG + 'assay/aid/' + AID + '/cids/JSON?cids_type=inactive&list_return=listkey'
    reply=requests.get(RESTQ)
    #extract the listkey
    inactive_listkey = reply.json()['IdentifierList']['ListKey']
    inactive_size = reply.json()['IdentifierList']['Size']
    inactive_fname = 'AID'+AID+'_inactive.sdf'
    get_compounds(fname=inactive_fname, size=inactive_size, listkey=inactive_listkey)

    return (active_fname,inactive_fname)

In [2]:
import datetime, time
def train_obabel_model(pos_fname, neg_fname, model_fname=None, n_iter=40, active_set_size=1000, n_active_learning_iterations=3, threshold=1, train_test_split=0.7, verbose=False):
    def pre_processor( data, **args):
        return data
    
    from eden.graph import Vectorizer
    vectorizer = Vectorizer()

    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True)

    #create iterable from files
    from eden.converter.molecule import obabel
    iterable_pos=obabel.obabel_to_eden(pos_fname)
    iterable_neg=obabel.obabel_to_eden(neg_fname)
    
    from itertools import tee
    iterable_pos, iterable_pos_ = tee(iterable_pos)
    iterable_neg, iterable_neg_ = tee(iterable_neg)
    
    import time
    start = time.time()
    print('# positives: %d  # negatives: %d (%.1f sec %s)'%(sum(1 for x in iterable_pos_), sum(1 for x in iterable_neg_), time.time() - start, str(datetime.timedelta(seconds=(time.time() - start)))))
    
    #split train/test
    from eden.util import random_bipartition_iter
    iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
    iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)



    #make predictive model
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel( pre_processor, estimator=estimator, vectorizer=vectorizer )

    #optimize hyperparameters and fit model
    from numpy.random import randint
    from numpy.random import uniform

    pre_processor_parameters={} 

    vectorizer_parameters={'complexity':[4]}

    estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                          'penalty':['l1','l2','elasticnet'],
                          'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                          'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                          'power_t':uniform(0.1, size=n_iter),
                          'alpha': [10**x for x in range(-8,-2)],
                          'eta0': [10**x for x in range(-4,-1)],
                          'learning_rate': ["invscaling", "constant", "optimal"]}

    model.optimize(iterable_pos_train, iterable_neg_train, 
                   model_name=model_fname,
                   n_active_learning_iterations=n_active_learning_iterations,
                   size_positive=-1,
                   size_negative=active_set_size,
                   n_iter=n_iter, cv=3, verbose=verbose,
                   pre_processor_parameters=pre_processor_parameters, 
                   vectorizer_parameters=vectorizer_parameters, 
                   estimator_parameters=estimator_parameters)
  
    #estimate predictive performance
    model.estimate( iterable_pos_test, iterable_neg_test, cv=5 )
    return model
    
    
def test_obabel_model(fname, model_fname=None):
    from eden.model import ActiveLearningBinaryClassificationModel

    model = ActiveLearningBinaryClassificationModel()
    model.load(model_fname)

    #create iterable from files
    from eden.converter.molecule import obabel
    iterable=obabel.obabel_to_eden(fname)
    
    predictions= model.decision_function( iterable )
        
    return predictions

In [3]:
# candidate PubChem assay ids; only the last assignment takes effect
AID=488918
AID=738
AID=743150
AID=2401

In [4]:
%%time

# set READ_FROM_FILE to False to download the assay data from PubChem instead of reading local SDF files
READ_FROM_FILE=False
READ_FROM_FILE=True

if READ_FROM_FILE:
    active_fname='AID%s_active.sdf'%AID
    inactive_fname='AID%s_inactive.sdf'%AID
else:
    active_fname, inactive_fname = get_assay(AID)


CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.11 µs
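
Before starting a long training run it can be worth a quick sanity check on the downloaded data. The cell below is a minimal sketch (it assumes the two SDF files are present in the working directory): it reports the file sizes and counts the molecule records by their '$$$$' terminators.

In [ ]:
import os

for fname in (active_fname, inactive_fname):
    size_kb = os.path.getsize(fname) / 1024.0
    # SDF records are separated by a line containing only '$$$$'
    with open(fname) as fh:
        n_records = sum(1 for line in fh if line.strip() == '$$$$')
    print('%s: %.1f KB, %d molecules' % (fname, size_kb, n_records))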

In [ ]:
%%time
model_fname='AID%s.model'%AID
model = train_obabel_model(active_fname, inactive_fname, model_fname=model_fname, 
                           n_iter=40, 
                           active_set_size=500, 
                           n_active_learning_iterations=4, 
                           threshold=1, 
                           train_test_split=0.8, 
                           verbose=1)
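
Once the optimization has finished, the saved model can be reloaded and used for scoring with the test_obabel_model helper defined above. The cell below is a minimal smoke test; it assumes the optimized model was written to model_fname during training, and simply re-scores the active set.

In [ ]:
%%time
# reload the saved model and compute decision-function margins for the active compounds
predictions = test_obabel_model(active_fname, model_fname=model_fname)
predictions = list(predictions)
print('scored %d molecules; first margins: %s' % (len(predictions), predictions[:5]))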

In [ ]:
from eden.converter.molecule import obabel
graphs=obabel.obabel_to_eden(active_fname,file_type = 'sdf')
from itertools import islice
graphs = islice(graphs, 3)
from eden.util.display import draw_graph
for graph in graphs:  draw_graph(graph, size=12, node_size=400, node_border=1, vertex_label='hlabel')

Old approach: explicit vectorization and classification


In [5]:
from eden.graph import Vectorizer
vectorizer=Vectorizer(complexity=5, nbits=14)
from eden.converter.molecule import obabel
print('Working on positive instances in %s and negative instances in %s' % (active_fname, inactive_fname))
active_graphs=obabel.obabel_to_eden(active_fname)
inactive_graphs=obabel.obabel_to_eden(inactive_fname)


Working on positive instances in AID2401_active.sdf and negative instances in AID2401_inactive.sdf

In [6]:
from eden.util import fit
fit(active_graphs,inactive_graphs, vectorizer)


Classifier:
SGDClassifier(alpha=0.000244367424549, average=True, class_weight='auto',
       epsilon=0.1, eta0=0.391312945136, fit_intercept=True, l1_ratio=0.15,
       learning_rate='constant', loss='hinge', n_iter=29, n_jobs=-1,
       penalty='l2', power_t=0.163214708985, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.844 +- 0.051
           precision: 0.726 +- 0.133
              recall: 0.591 +- 0.119
                  f1: 0.642 +- 0.102
   average_precision: 0.722 +- 0.143
             roc_auc: 0.867 +- 0.072
--------------------------------------------------------------------------------
Out[6]:
SGDClassifier(alpha=0.000244367424549, average=True, class_weight='auto',
       epsilon=0.1, eta0=0.391312945136, fit_intercept=True, l1_ratio=0.15,
       learning_rate='constant', loss='hinge', n_iter=29, n_jobs=-1,
       penalty='l2', power_t=0.163214708985, random_state=None,
       shuffle=True, verbose=0, warm_start=False)

In [8]:
%%time
from eden import vectorize
active_X = vectorize(active_graphs,vectorizer)
inactive_X = vectorize(inactive_graphs,vectorizer)
from scipy.sparse import vstack
import numpy as np
X=vstack( [active_X,inactive_X] )
yp=[1]*active_X.shape[0]
yn=[-1]*inactive_X.shape[0]
y=np.array(yp+yn)


CPU times: user 8.13 s, sys: 875 ms, total: 9 s
Wall time: 26.6 s
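
Before inducing a classifier on the explicit data matrix it is worth confirming its dimensions and the class balance, since bioassay data is typically skewed towards inactives. The following check uses only the X and y built above.

In [ ]:
# shape of the sparse data matrix and number of instances per class
print('data matrix: %d instances x %d features' % X.shape)
print('positives: %d  negatives: %d' % ((y == 1).sum(), (y == -1).sum()))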

In [9]:
%%time
from sklearn.linear_model import SGDClassifier
from sklearn import cross_validation

#induce a predictive model
predictor = SGDClassifier(class_weight = 'auto', shuffle = True, average=True)
scores = cross_validation.cross_val_score(predictor, X, y,cv=10, scoring='roc_auc')
print('AUC ROC: %.4f +- %.4f' % (np.mean(scores),np.std(scores)))


AUC ROC: 0.8647 +- 0.0742
CPU times: user 719 ms, sys: 148 ms, total: 867 ms
Wall time: 866 ms

In [10]:
%%time
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=500, random_state=42)
Xd=svd.fit_transform(X) 
print(svd.explained_variance_ratio_.sum())


0.534820064485
CPU times: user 15.9 s, sys: 443 ms, total: 16.3 s
Wall time: 13.3 s
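
Roughly half of the variance is captured with 500 components. If a different trade-off is needed, the cumulative explained-variance curve of the fitted svd object makes it easy to pick the smallest number of components that reaches a target; the cell below is an illustrative sketch with a 50% threshold.

In [ ]:
import numpy as np
# cumulative explained variance, one entry per component
cumulative = np.cumsum(svd.explained_variance_ratio_)
target = 0.5  # illustrative threshold
# smallest number of components whose cumulative ratio reaches the target
n_components = int(np.searchsorted(cumulative, target)) + 1
print('%d components explain %.1f%% of the variance' % (n_components, 100 * cumulative[n_components - 1]))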

In [11]:
%%time
from sklearn.linear_model import SGDClassifier
from sklearn import cross_validation

#induce a predictive model
predictor = SGDClassifier(class_weight = 'auto', shuffle = True, average=True)
scores = cross_validation.cross_val_score(predictor, Xd, y,cv=10, scoring='roc_auc')
print('AUC ROC: %.4f +- %.4f' % (np.mean(scores),np.std(scores)))


AUC ROC: 0.8606 +- 0.0690
CPU times: user 202 ms, sys: 24.2 ms, total: 226 ms
Wall time: 226 ms

In [ ]: