In [1]:
%matplotlib inline
import eden
import matplotlib.pyplot as plt
from eden.util import configure_logging
import logging
logger = logging.getLogger()
In [2]:
from itertools import tee, chain, islice
import numpy as np
import random
from time import time
import datetime
from graphlearn.graphlearn import GraphLearnSampler
from eden.util import fit, estimate
from eden.graph import Vectorizer
# get data
from eden.converter.graph.gspan import gspan_to_eden
from eden.converter.molecule.obabel import mol_file_to_iterable
from eden.converter.molecule.obabel import obabel_to_eden
def get_graphs(dataset_fname, size=None):
    # parse the SMILES file and convert each molecule into an EDeN graph
    iterable = mol_file_to_iterable(dataset_fname, file_format='smi')
    graphs = obabel_to_eden(iterable, file_format='smi')
    return islice(graphs, size)
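A quick sanity check (a minimal sketch, assuming the bursi SMILES files sit in the working directory): the EDeN converters yield networkx graphs, so we can materialize a couple and inspect their size.
In [ ]:
# peek at the first two parsed molecules; node/edge counts come from the networkx API
for g in get_graphs('bursi_pos_orig.smi', size=2):
    print('%d nodes, %d edges' % (g.number_of_nodes(), g.number_of_edges()))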
In [3]:
# rename to pre_processor and expose all relevant parameters for optimization
def generate_sample(graphs,
                    random_state=42,
                    complexity=5,
                    nu=0.25,
                    radius_list=[0, 1],
                    thickness_list=[2, 3],
                    n_steps=5,
                    n_samples=4,
                    burnin=1,
                    improving_threshold=0.25,
                    max_core_size_diff=3):
    # duplicate the iterator: one copy fits the grammar, the other is sampled from
    graphs, graphs_ = tee(graphs)
    sampler = GraphLearnSampler(radius_list=radius_list,
                                thickness_list=thickness_list,
                                min_cip_count=2, min_interface_count=2,
                                vectorizer=Vectorizer(complexity),
                                random_state=random_state)
    # induce the graph grammar and fit the internal scoring estimator
    sampler.fit(graphs, nu=nu, n_jobs=-1)
    logger.info('graph grammar stats:')
    dataset_size, interface_counts, core_counts, cip_counts = sampler.grammar().size()
    logger.info('#instances:%d #interfaces: %d #cores: %d #core-interface-pairs: %d' %
                (dataset_size, interface_counts, core_counts, cip_counts))
    # generate new graphs by iteratively substituting core-interface pairs
    graphs = sampler.sample(graphs_,
                            n_steps=n_steps,
                            n_samples=n_samples,
                            target_orig_cip=True,
                            probabilistic_core_choice=False,
                            score_core_choice=False,
                            max_core_size_diff=max_core_size_diff,
                            burnin=burnin,
                            omit_seed=True,
                            max_cycle_size=6,
                            improving_threshold=improving_threshold,
                            accept_static_penalty=0,
                            n_jobs=-1,
                            select_cip_max_tries=200,
                            keep_duplicates=False,
                            generator_mode=True)
    return graphs
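The sampler can be exercised on its own before wiring it into model optimization. A minimal sketch (assuming the bursi SMILES file is available; with so few molecules the induced grammar may be sparse, so this is only a smoke test):
In [ ]:
# fit the grammar on 50 positives and materialize the generated graphs
sampled = generate_sample(get_graphs('bursi_pos_orig.smi', size=50),
                          n_steps=3, n_samples=2)
logger.info('materialized %d sampled graphs' % len(list(sampled)))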
In [4]:
def constructive_model(pos_fname, neg_fname, size=None, model_fname=None, n_iter=40,
                       train_test_split=0.7, random_state=42):
    def pre_processor(graphs, **args):
        graphs = generate_sample(graphs, **args)
        return graphs

    from eden.graph import Vectorizer
    vectorizer = Vectorizer()
    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True)
    # create iterables from files
    iterable_pos = get_graphs(pos_fname, size=size)
    iterable_neg = get_graphs(neg_fname, size=size)
    from itertools import tee
    iterable_pos, iterable_pos_ = tee(iterable_pos)
    iterable_neg, iterable_neg_ = tee(iterable_neg)
    import time
    start = time.time()
    logger.info('-'*80)
    logger.info('Dataset')
    logger.info('# positives: %d # negatives: %d (%.1f sec %s)' %
                (sum(1 for x in iterable_pos_), sum(1 for x in iterable_neg_),
                 time.time() - start, str(datetime.timedelta(seconds=(time.time() - start)))))
    # split train/test
    from eden.util import random_bipartition_iter
    iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
    iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)
    # make predictive model
    # NOTE: since parallelization cannot happen in a nested way, and since graphlearn
    # already parallelizes the sampling, we avoid parallelizing the pre_processor (pre_processor_n_jobs=1)
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel(pre_processor,
                                                    estimator=estimator,
                                                    vectorizer=vectorizer,
                                                    pre_processor_n_jobs=1,
                                                    random_state=random_state)
    # optimize hyperparameters and fit model
    from numpy.random import randint
    from numpy.random import uniform
    pre_processor_parameters = {'complexity': [3, 5],
                                'nu': [0.1, 0.25, 0.33, 0.5],
                                'radius_list': [[0, 1, 2]],
                                'thickness_list': [[1, 2], [2], [2, 3]],
                                'n_steps': [5, 7, 9],
                                'n_samples': [2, 4],
                                'burnin': [0, 1, 2],
                                'improving_threshold': [0.25, 0.33, 0.5],
                                'max_core_size_diff': [0, 1, 2, 3],
                                'random_state': [random_state]}
    vectorizer_parameters = {'complexity': [3, 4, 5]}
    estimator_parameters = {'n_iter': randint(5, 100, size=n_iter),
                            'penalty': ['l1', 'l2', 'elasticnet'],
                            'l1_ratio': uniform(0.1, 0.9, size=n_iter),
                            'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                            'power_t': uniform(0.1, size=n_iter),
                            'alpha': [10**x for x in range(-8, -2)],
                            'eta0': [10**x for x in range(-4, -1)],
                            'learning_rate': ['invscaling', 'constant', 'optimal']}
    logger.info('-'*80)
    logger.info('Choosing from parameters:')
    from eden.util import serialize_dict
    logger.info(serialize_dict(pre_processor_parameters))
    logger.info(serialize_dict(vectorizer_parameters))
    logger.info(serialize_dict(estimator_parameters))
    logger.info('-'*80)
    model.optimize(iterable_pos_train, iterable_neg_train,
                   model_name=model_fname,
                   n_iter=n_iter,
                   pre_processor_parameters=pre_processor_parameters,
                   vectorizer_parameters=vectorizer_parameters,
                   estimator_parameters=estimator_parameters)
    # estimate predictive performance on original data, i.e. without sampling
    logger.info('-'*80)
    logger.info('Parameters:')
    opt_params = model.get_parameters()
    logger.info(opt_params)
    opt_vectorizer = model.get_vectorizer()
    opt_estimator = model.get_estimator()
    from eden.util import estimate
    apr, roc = estimate(iterable_pos_test, iterable_neg_test,
                        estimator=opt_estimator,
                        vectorizer=opt_vectorizer)
    return model
In [5]:
configure_logging(logger, verbosity=1)
In [ ]:
%%time
pos_fname = 'bursi_pos_orig.smi'
neg_fname = 'bursi_neg_orig.smi'
model = constructive_model(pos_fname, neg_fname, size=200, model_fname='bursi',
                           n_iter=40, train_test_split=0.5, random_state=2)
In [6]:
%%time
pos_fname = 'bursi_pos_orig.smi'
neg_fname = 'bursi_neg_orig.smi'
model = constructive_model(pos_fname, neg_fname, size=100, model_fname='bursi',
                           n_iter=5, train_test_split=0.5)
In [8]:
%%time
# explicit experiment
start_global = time()
# train a model on sampled data, then test it on original data
# (different from the molecules that generated the samples) and compare
from eden.graph import Vectorizer
vectorizer = Vectorizer(5)
# setup
size = 100
pos_fname = 'bursi_pos_orig.smi'
neg_fname = 'bursi_neg_orig.smi'
iterable_pos = get_graphs(pos_fname, size=size)
iterable_neg = get_graphs(neg_fname, size=size)
random_state = 42
train_test_split = 0.7
#split train/test
from eden.util import random_bipartition_iter
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)
args = {'random_state': 42,
        'complexity': 5,
        'nu': 0.1,
        'radius_list': [0, 1],
        'thickness_list': [1, 2],
        'n_steps': 5,
        'n_samples': 4,
        'burnin': 1,
        'improving_threshold': 0.25,
        'max_core_size_diff': 0}
logger.info('-'*80)
logger.info('Params:')
from eden.util import serialize_dict
logger.info(serialize_dict(args))
#train
start = time()
logger.info('-'*80)
logger.info('Grammar induction:')
logger.info('Positives:')
sampled_pos = generate_sample(iterable_pos_train, **args)
logger.info('Time elapsed: %s'%(datetime.timedelta(seconds=(time() - start))))
start = time()
logger.info('Negatives:')
sampled_neg = generate_sample(iterable_neg_train, **args)
logger.info('Time elapsed: %s'%(datetime.timedelta(seconds=(time() - start))))
start = time()
logger.info('-'*80)
logger.info('Fitting:')
from eden.util import fit
estimator = fit(sampled_pos,
                sampled_neg,
                vectorizer,
                fit_flag=False,
                n_jobs=-1,
                cv=10,
                n_iter_search=5,
                random_state=1,
                block_size=100)
logger.info('Time elapsed: %s'%(datetime.timedelta(seconds=(time() - start))))
#test
start = time()
logger.info('-'*80)
logger.info('Testing:')
from eden.util import estimate
apr, roc = estimate(iterable_pos_test,
                    iterable_neg_test,
                    estimator,
                    vectorizer,
                    block_size=100,
                    n_jobs=-1)
logger.info('Time elapsed: %s'%(datetime.timedelta(seconds=(time() - start))))
logger.info('Global time elapsed: %s'%(datetime.timedelta(seconds=(time() - start_global))))
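With the fitted estimator and vectorizer in hand, scoring unseen molecules is a vectorizer transform followed by a decision-function call. A minimal sketch (it simply reuses the first ten positives; any SMILES-derived graph iterable would do):
In [ ]:
# vectorize fresh graphs and score them with the trained linear model
X = vectorizer.transform(get_graphs(pos_fname, size=10))
print(estimator.decision_function(X))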