In [1]:
import requests

def get_compounds(fname, size, listkey):
    """Download the compounds behind a PubChem PUG REST listkey in SDF chunks."""
    PROLOG = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/'
    with open(fname, 'w') as file_handle:
        stepsize = 50
        index_start = 0
        for chunk, index_end in enumerate(range(0, size + stepsize, stepsize)):
            if index_end != 0:
                print('Chunk %s) Processing compounds %s to %s (of a total of %s)' % (chunk, index_start, index_end - 1, size))
                RESTQ = PROLOG + 'compound/listkey/' + str(listkey) + '/SDF?&listkey_start=' + str(index_start) + '&listkey_count=' + str(stepsize)
                reply = requests.get(RESTQ)
                file_handle.write(reply.text)
                index_start = index_end
    print('compounds available in file: %s' % fname)
def get_assay(assay_id):
    PROLOG = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/'
    AID = str(assay_id)
    #active
    RESTQ = PROLOG + 'assay/aid/' + AID + '/cids/JSON?cids_type=active&list_return=listkey'
    reply = requests.get(RESTQ)
    #extract the listkey
    active_listkey = reply.json()['IdentifierList']['ListKey']
    active_size = reply.json()['IdentifierList']['Size']
    active_fname = 'AID' + AID + '_active.sdf'
    get_compounds(fname=active_fname, size=active_size, listkey=active_listkey)
    #inactive
    RESTQ = PROLOG + 'assay/aid/' + AID + '/cids/JSON?cids_type=inactive&list_return=listkey'
    reply = requests.get(RESTQ)
    #extract the listkey
    inactive_listkey = reply.json()['IdentifierList']['ListKey']
    inactive_size = reply.json()['IdentifierList']['Size']
    inactive_fname = 'AID' + AID + '_inactive.sdf'
    get_compounds(fname=inactive_fname, size=inactive_size, listkey=inactive_listkey)
    return (active_fname, inactive_fname)
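For reference, a minimal sketch (built only from the same PUG REST listkey query used above) that just reports how many active and inactive compounds an assay has, without triggering the chunked SDF download:
In [ ]:
import requests

# Hedged sketch: peek at the size of the active/inactive compound sets of an assay
# before committing to the chunked SDF download performed by get_compounds.
PROLOG = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/'
aid = 2401  # example assay id, the same one used later in this notebook
for outcome in ('active', 'inactive'):
    RESTQ = PROLOG + 'assay/aid/%s/cids/JSON?cids_type=%s&list_return=listkey' % (aid, outcome)
    info = requests.get(RESTQ).json()['IdentifierList']
    print('%s compounds: %s (listkey: %s)' % (outcome, info['Size'], info['ListKey']))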
In [2]:
import datetime, time
def train_obabel_model(pos_fname, neg_fname, model_fname=None, n_iter=40, active_set_size=1000,
                       n_active_learning_iterations=3, threshold=1, train_test_split=0.7, verbose=False):
    def pre_processor(data, **args):
        return data

    from eden.graph import Vectorizer
    vectorizer = Vectorizer()

    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True)

    #create iterables from files
    from eden.converter.molecule import obabel
    iterable_pos = obabel.obabel_to_eden(pos_fname)
    iterable_neg = obabel.obabel_to_eden(neg_fname)

    from itertools import tee
    iterable_pos, iterable_pos_ = tee(iterable_pos)
    iterable_neg, iterable_neg_ = tee(iterable_neg)

    start = time.time()
    print('# positives: %d  # negatives: %d (%.1f sec %s)' % (sum(1 for x in iterable_pos_),
                                                              sum(1 for x in iterable_neg_),
                                                              time.time() - start,
                                                              str(datetime.timedelta(seconds=(time.time() - start)))))

    #split train/test
    from eden.util import random_bipartition_iter
    iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
    iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)

    #make predictive model
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel(pre_processor, estimator=estimator, vectorizer=vectorizer)

    #optimize hyperparameters and fit model
    from numpy.random import randint
    from numpy.random import uniform
    pre_processor_parameters = {}
    vectorizer_parameters = {'complexity': [4]}
    estimator_parameters = {'n_iter': randint(5, 100, size=n_iter),
                            'penalty': ['l1', 'l2', 'elasticnet'],
                            'l1_ratio': uniform(0.1, 0.9, size=n_iter),
                            'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                            'power_t': uniform(0.1, size=n_iter),
                            'alpha': [10**x for x in range(-8, -2)],
                            'eta0': [10**x for x in range(-4, -1)],
                            'learning_rate': ["invscaling", "constant", "optimal"]}
    model.optimize(iterable_pos_train, iterable_neg_train,
                   model_name=model_fname,
                   n_active_learning_iterations=n_active_learning_iterations,
                   size_positive=-1,
                   size_negative=active_set_size,
                   n_iter=n_iter, cv=3, verbose=verbose,
                   pre_processor_parameters=pre_processor_parameters,
                   vectorizer_parameters=vectorizer_parameters,
                   estimator_parameters=estimator_parameters)

    #estimate predictive performance
    model.estimate(iterable_pos_test, iterable_neg_test, cv=5)
    return model
def test_obabel_model(fname, model_fname=None):
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel()
    model.load(model_fname)
    #create iterable from file and score the molecules with the saved model
    from eden.converter.molecule import obabel
    iterable = obabel.obabel_to_eden(fname)
    predictions = model.decision_function(iterable)
    return predictions
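test_obabel_model is not exercised later in this notebook; a minimal usage sketch follows, assuming the model file written by train_obabel_model for AID 2401 ('AID2401.model') and the SDF file downloaded above ('AID2401_active.sdf'):
In [ ]:
# Hedged usage sketch: reload the trained model and compute decision values
# for the molecules in one of the downloaded SDF files.
predictions = list(test_obabel_model('AID2401_active.sdf', model_fname='AID2401.model'))
print('scored %d molecules; first decision values: %s' % (len(predictions), predictions[:5]))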
In [3]:
# candidate PubChem assay ids explored in this notebook; only the last assignment (AID=2401) is used below
AID=488918
AID=738
AID=743150
AID=2401
In [4]:
%%time
# set READ_FROM_FILE=False to download the assay SDF files from PubChem instead of reusing local copies
READ_FROM_FILE=False
READ_FROM_FILE=True
if READ_FROM_FILE:
    active_fname = 'AID%s_active.sdf' % AID
    inactive_fname = 'AID%s_inactive.sdf' % AID
else:
    active_fname, inactive_fname = get_assay(AID)
In [ ]:
%%time
model_fname='AID%s.model'%AID
model = train_obabel_model(active_fname, inactive_fname, model_fname=model_fname,
                           n_iter=40,
                           active_set_size=500,
                           n_active_learning_iterations=4,
                           threshold=1,
                           train_test_split=0.8,
                           verbose=1)
In [ ]:
from eden.converter.molecule import obabel
graphs = obabel.obabel_to_eden(active_fname, file_type='sdf')
from itertools import islice
graphs = islice(graphs, 3)
from eden.util.display import draw_graph
for graph in graphs:
    draw_graph(graph, size=12, node_size=400, node_border=1, vertex_label='hlabel')
In [5]:
from eden.graph import Vectorizer
vectorizer = Vectorizer(complexity=5, nbits=14)
from eden.converter.molecule import obabel
print('Working on positive instances in %s and negative instances in %s' % (active_fname, inactive_fname))
active_graphs = obabel.obabel_to_eden(active_fname)
inactive_graphs = obabel.obabel_to_eden(inactive_fname)
In [6]:
from eden.util import fit
fit(active_graphs,inactive_graphs, vectorizer)
Out[6]:
In [8]:
%%time
from eden import vectorize
active_X = vectorize(active_graphs,vectorizer)
inactive_X = vectorize(inactive_graphs,vectorizer)
from scipy.sparse import vstack
import numpy as np
# stack the positive and negative feature matrices and build the target vector
X = vstack([active_X, inactive_X])
yp = [1] * active_X.shape[0]
yn = [-1] * inactive_X.shape[0]
y = np.array(yp + yn)
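A quick sanity check on the assembled dataset; a sketch only, since the number of features depends on the nbits setting of the vectorizer:
In [ ]:
# Hedged sketch: report the size and class balance of the stacked dataset.
print('feature matrix: %d instances x %d features' % X.shape)
print('positives: %d  negatives: %d' % ((y == 1).sum(), (y == -1).sum()))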
In [9]:
%%time
from sklearn.linear_model import SGDClassifier
from sklearn import cross_validation
#induce a predictive model
predictor = SGDClassifier(class_weight='auto', shuffle=True, average=True)
scores = cross_validation.cross_val_score(predictor, X, y, cv=10, scoring='roc_auc')
print('AUC ROC: %.4f +- %.4f' % (np.mean(scores), np.std(scores)))
In [10]:
%%time
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=500, random_state=42)
Xd = svd.fit_transform(X)
print('explained variance ratio: %.4f' % svd.explained_variance_ratio_.sum())
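Since the SVD embedding is dense and low dimensional, the first two components can be plotted to eyeball class separation; a minimal matplotlib sketch, assuming an inline plotting backend:
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt

# Hedged sketch: scatter plot of the first two SVD components, colored by class.
plt.figure(figsize=(8, 6))
plt.scatter(Xd[y == 1, 0], Xd[y == 1, 1], c='r', alpha=0.3, label='active')
plt.scatter(Xd[y == -1, 0], Xd[y == -1, 1], c='b', alpha=0.3, label='inactive')
plt.xlabel('SVD component 1')
plt.ylabel('SVD component 2')
plt.legend()
plt.show()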
In [11]:
%%time
from sklearn.linear_model import SGDClassifier
from sklearn import cross_validation
#induce a predictive model
predictor = SGDClassifier(class_weight='auto', shuffle=True, average=True)
scores = cross_validation.cross_val_score(predictor, Xd, y, cv=10, scoring='roc_auc')
print('AUC ROC: %.4f +- %.4f' % (np.mean(scores), np.std(scores)))
In [ ]: