In [1]:
%matplotlib inline
import eden
import matplotlib.pyplot as plt
from eden.util import configure_logging
import logging
logger = logging.getLogger()

In [2]:
from itertools import tee, chain, islice
import numpy as np
import random
from time import time
import datetime
from graphlearn.graphlearn import GraphLearnSampler
from eden.util import fit, estimate
from eden.graph import Vectorizer
# get data
from eden.converter.graph.gspan import gspan_to_eden
from eden.converter.molecule.obabel import mol_file_to_iterable
from eden.converter.molecule.obabel import obabel_to_eden

def get_graphs(dataset_fname, size=None):
    # parse a SMILES file and convert each molecule into an EDeN (networkx) graph
    iterable = mol_file_to_iterable(dataset_fname, file_format='smi')
    graphs = obabel_to_eden(iterable, file_format='smi')
    # size=None yields the whole stream; otherwise truncate to the first `size` graphs
    return islice(graphs, size)
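
A quick sanity check of the loader (a sketch, assuming the bursi .smi files used below are present; EDeN graphs are networkx graphs):

# peek at the first few molecule graphs
for g in get_graphs('bursi_pos_orig.smi', size=3):
    print(g.number_of_nodes(), g.number_of_edges())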

In [3]:
#this function is wrapped as the pre_processor below; all relevant sampling parameters are exposed for optimization

def generate_sample(graphs,
                    random_state=42,
                    complexity=5,
                    nu=0.25,
                    radius_list=[0,1],
                    thickness_list=[2,3],
                    n_steps=5,
                    n_samples=4,
                    burnin=1,
                    improving_threshold=0.25,
                    max_core_size_diff=3):
    # duplicate the stream: one copy fits the grammar, the other seeds the sampling
    graphs, graphs_ = tee(graphs)
    sampler = GraphLearnSampler(radius_list=radius_list, thickness_list=thickness_list,
                                min_cip_count=2, min_interface_count=2,
                                vectorizer=Vectorizer(complexity), random_state=random_state)

    # induce the grammar and fit the one-class estimator on the input graphs
    sampler.fit(graphs, nu=nu, n_jobs=-1)

    logger.info('graph grammar stats:')
    dataset_size, interface_counts, core_counts, cip_counts = sampler.grammar().size()
    logger.info('#instances:%d   #interfaces: %d   #cores: %d   #core-interface-pairs: %d' % (dataset_size, interface_counts, core_counts, cip_counts))
    # sample new graphs from the induced grammar
    graphs = sampler.sample(graphs_,
                            n_steps=n_steps, 
                            n_samples=n_samples,
                            target_orig_cip=True,
                            probabilistic_core_choice=False,
                            score_core_choice=False,
                            max_core_size_diff=max_core_size_diff,
                            burnin=burnin,
                            omit_seed=True,
                            max_cycle_size=6,
                            improving_threshold=improving_threshold,
                            accept_static_penalty=0,
                            n_jobs=-1,
                            select_cip_max_tries=200,
                            keep_duplicates=False,
                            generator_mode=True)
    return graphs
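
The sampler output is a lazy generator; nothing is computed until it is consumed. A standalone call would look like this (a sketch, reusing the file name from the experiments below):

# materialize the generated molecules for the positive class
sampled = list(generate_sample(get_graphs('bursi_pos_orig.smi', size=50)))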

In [4]:
def constructive_model(pos_fname, neg_fname, size=None, model_fname=None, n_iter=40, train_test_split=0.7, random_state=42):
    def pre_processor(graphs, **args):
        graphs = generate_sample(graphs, **args)
        return graphs
    
    from eden.graph import Vectorizer
    vectorizer = Vectorizer()

    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True)

    #create iterable from files
    iterable_pos = get_graphs(pos_fname, size=size)
    iterable_neg = get_graphs(neg_fname, size=size)


    # duplicate the streams so the instance counts below do not consume the originals
    from itertools import tee
    iterable_pos, iterable_pos_ = tee(iterable_pos)
    iterable_neg, iterable_neg_ = tee(iterable_neg)

    start = time()
    logger.info('-'*80)
    logger.info('Dataset')
    logger.info('# positives: %d  # negatives: %d (%.1f sec %s)' %
                (sum(1 for x in iterable_pos_), sum(1 for x in iterable_neg_),
                 time() - start, str(datetime.timedelta(seconds=(time() - start)))))
    
    #split train/test
    from eden.util import random_bipartition_iter
    iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
    iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)



    #make predictive model
    #NOTE: parallelization cannot be nested and graphlearn already parallelizes
    #internally, so the pre-processor is kept serial (pre_processor_n_jobs=1)
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel(pre_processor,
                                                    estimator=estimator,
                                                    vectorizer=vectorizer,
                                                    pre_processor_n_jobs=1,
                                                    random_state=random_state)

    #optimize hyperparameters and fit model
    from numpy.random import randint
    from numpy.random import uniform

    #candidate values from which the optimizer samples at each iteration
    pre_processor_parameters={'complexity':[3,5],
                              'nu':[0.1,0.25,0.33,0.5],
                              'radius_list':[[0,1,2]],
                              'thickness_list':[[1,2],[2],[2,3]],
                              'n_steps':[5,7,9],
                              'n_samples':[2,4],
                              'burnin':[0,1,2],
                              'improving_threshold':[0.25,0.33,0.5],
                              'max_core_size_diff':[0,1,2,3],
                              'random_state':[random_state]} 

    vectorizer_parameters={'complexity':[3,4,5]}

    estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                          'penalty':['l1','l2','elasticnet'],
                          'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                          'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                          'power_t':uniform(0.1, size=n_iter),
                          'alpha': [10**x for x in range(-8,-2)],
                          'eta0': [10**x for x in range(-4,-1)],
                          'learning_rate': ["invscaling", "constant", "optimal"]}

    logger.info('-'*80)
    logger.info('Choosing from parameters:')
    from eden.util import serialize_dict
    logger.info(serialize_dict(pre_processor_parameters))
    logger.info(serialize_dict(vectorizer_parameters))
    logger.info(serialize_dict(estimator_parameters))
    logger.info('-'*80)

    model.optimize(iterable_pos_train, iterable_neg_train, 
                   model_name=model_fname,
                   n_iter=n_iter,
                   pre_processor_parameters=pre_processor_parameters, 
                   vectorizer_parameters=vectorizer_parameters, 
                   estimator_parameters=estimator_parameters)
  
    #estimate predictive performance on original data, i.e. without sampling
    logger.info('-'*80)
    logger.info('Parameters:')
    opt_params = model.get_parameters()
    logger.info(opt_params)
    opt_vectorizer = model.get_vectorizer()
    opt_estimator = model.get_estimator()
    from eden.util import estimate
    apr, roc = estimate(iterable_pos_test, iterable_neg_test,
                        estimator=opt_estimator,
                        vectorizer=opt_vectorizer)
    return model
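
A note on the split used above: random_bipartition_iter partitions a stream in two according to relative_size. A minimal sketch of the intended behavior (illustrative input):

from eden.util import random_bipartition_iter
train, test = random_bipartition_iter(range(10), relative_size=0.7)
print(len(list(train)), len(list(test)))  # roughly 7 and 3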

Experimental pipeline


In [5]:
configure_logging(logger, verbosity=1)

In [ ]:
%%time
pos_fname='bursi_pos_orig.smi'
neg_fname='bursi_neg_orig.smi'
model = constructive_model(pos_fname, neg_fname, size=200, model_fname='bursi',
                           n_iter=40, train_test_split=0.5, random_state=2)


--------------------------------------------------------------------------------
Dataset
# positives: 200  # negatives: 200 (0.3 sec 0:00:00.306129)
--------------------------------------------------------------------------------
Choosing from parameters:
    burnin: [0, 1, 2]
complexity: [3, 5]
improving_threshold: [0.25, 0.33, 0.5]
max_core_size_diff: [0, 1, 2, 3]
 n_samples: [2, 4]
   n_steps: [5, 7, 9]
        nu: [0.1, 0.25, 0.33, 0.5]
radius_list: [[0, 1, 2]]
random_state: [2]
thickness_list: [[1, 2], [2], [2, 3]]
complexity: [3, 4, 5]
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.31246886  0.56769089  0.14766148  0.43874078  0.86867181  0.37064723
  0.406477    0.33693454  0.54560713  0.1560296   0.7785867   0.27993394
  0.56328043  0.41195084  0.66254261  0.49290766  0.62098001  0.61171872
  0.63882845  0.7668905   0.68173755  0.65347872  0.51550808  0.28036526
  0.66254283  0.21045886  0.24640558  0.18052233  0.21385312  0.58166914
  0.62188071  0.72052123  0.69775508  0.2320831   0.41381769  0.16556153
  0.20199773  0.37049483  0.16240002  0.25522952]
learning_rate: ['invscaling', 'constant', 'optimal']
      loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
    n_iter: [29 64  8 36 15 82 82 77 17 12 10 38 93 68 13 97 43 35  5 27 13 21 37 36 95
 59  8 12 86 26 73 35 17 14  7 53 35 41 47 44]
   penalty: ['l1', 'l2', 'elasticnet']
   power_t: [ 0.13498227  0.88700743  0.47329674  0.39888446  0.45619738  0.29082095
  0.14021311  0.62115829  0.2965092   0.43765629  0.14485034  0.54057227
  0.57732231  0.56346277  0.95517917  0.48152365  0.66515637  0.9519581
  0.3734739   0.14507125  0.44499049  0.36388989  0.99803888  0.20780038
  0.13921404  0.70299443  0.63256569  0.14144979  0.7685012   0.87532202
  0.8967954   0.89671021  0.60272437  0.51446809  0.17310581  0.6590742
  0.53393811  0.93898402  0.72681932  0.82765027]
--------------------------------------------------------------------------------
graph grammar stats:
#instances:100   #interfaces: 151   #cores: 193   #core-interface-pairs: 560
graph grammar stats:
#instances:100   #interfaces: 208   #cores: 203   #core-interface-pairs: 775


	Iteration: 1/40 (after 61.4 sec; 0:01:01.351543)
Best score (roc_auc): 0.686 (0.831 +- 0.145)

Data:
Instances: 83 ; Features: 1048577 with an avg of 199 features
class: 1 count:37 (0.45)	class: -1 count:46 (0.55)	

	Model parameters:

Pre_processor:
    burnin: 0
complexity: 3
improving_threshold: 0.25
max_core_size_diff: 0
 n_samples: 2
   n_steps: 5
        nu: 0.1
radius_list: [0, 1, 2]
random_state: 2
thickness_list: [1, 2]

Vectorizer:
complexity: 3

Estimator:
     alpha: 1e-08
      eta0: 0.01
  l1_ratio: 0.246405576265
learning_rate: invscaling
      loss: squared_hinge
    n_iter: 41
   penalty: elasticnet
   power_t: 0.398884464812


	Iteration: 1/40 (after 62.7 sec; 0:01:02.705226)
Best score (roc_auc): 0.797 (0.897 +- 0.099)

Data:
Instances: 83 ; Features: 1048577 with an avg of 199 features
class: 1 count:37 (0.45)	class: -1 count:46 (0.55)	

	Model parameters:

Pre_processor:
    burnin: 0
complexity: 3
improving_threshold: 0.25
max_core_size_diff: 0
 n_samples: 2
   n_steps: 5
        nu: 0.1
radius_list: [0, 1, 2]
random_state: 2
thickness_list: [1, 2]

Vectorizer:
complexity: 3

Estimator:
     alpha: 1e-05
      eta0: 0.001
  l1_ratio: 0.21385312221
learning_rate: optimal
      loss: log
    n_iter: 95
   penalty: l2
   power_t: 0.140213110465
graph grammar stats:
#instances:100   #interfaces: 103   #cores: 92   #core-interface-pairs: 274

In [6]:
%%time
pos_fname='bursi_pos_orig.smi'
neg_fname='bursi_neg_orig.smi'
model = constructive_model(pos_fname, neg_fname, size=100, model_fname='bursi', n_iter=5, train_test_split=0.5)


# positives: 100  # negatives: 100 (0.2 sec 0:00:00.176622)
graph grammar stats:
#instances:50   #interfaces: 36   #cores: 53   #core-interface-pairs: 119
graph grammar stats:
#instances:50   #interfaces: 41   #cores: 53   #core-interface-pairs: 137


	Iteration: 1/5 (after 38.4 sec; 0:00:38.413056)
Best score (roc_auc): 0.729 (0.862 +- 0.133)

Data:
Instances: 187 ; Features: 1048577 with an avg of 379 features
class: 1 count:91 (0.49)	class: -1 count:96 (0.51)	

	Model parameters:

Pre_processor:
    burnin: 1
complexity: 5
improving_threshold: 0.25
max_core_size_diff: 0
 n_samples: 4
   n_steps: 5
        nu: 0.1
radius_list: [0, 1]
random_state: 42
thickness_list: [1]

Vectorizer:
complexity: 4

Estimator:
     alpha: 1e-07
      eta0: 0.01
  l1_ratio: 0.698057935388
learning_rate: invscaling
      loss: squared_hinge
    n_iter: 36
   penalty: l2
   power_t: 0.193562877926


	Iteration: 1/5 (after 43.2 sec; 0:00:43.201064)
Best score (roc_auc): 0.746 (0.894 +- 0.148)

Data:
Instances: 187 ; Features: 1048577 with an avg of 379 features
class: 1 count:91 (0.49)	class: -1 count:96 (0.51)	

	Model parameters:

Pre_processor:
    burnin: 1
complexity: 5
improving_threshold: 0.25
max_core_size_diff: 0
 n_samples: 4
   n_steps: 5
        nu: 0.1
radius_list: [0, 1]
random_state: 42
thickness_list: [1]

Vectorizer:
complexity: 4

Estimator:
     alpha: 1e-08
      eta0: 0.01
  l1_ratio: 0.698057935388
learning_rate: optimal
      loss: squared_hinge
    n_iter: 89
   penalty: elasticnet
   power_t: 0.712351665998
graph grammar stats:
#instances:50   #interfaces: 50   #cores: 53   #core-interface-pairs: 158
graph grammar stats:
#instances:50   #interfaces: 64   #cores: 53   #core-interface-pairs: 199


	Iteration: 2/5 (after 79.1 sec; 0:01:19.094457)
Best score (roc_auc): 0.770 (0.859 +- 0.089)

Data:
Instances: 192 ; Features: 1048577 with an avg of 354 features
class: 1 count:92 (0.48)	class: -1 count:100 (0.52)	

	Model parameters:

Pre_processor:
    burnin: 1
complexity: 5
improving_threshold: 0.5
max_core_size_diff: 3
 n_samples: 4
   n_steps: 5
        nu: 0.33
radius_list: [0, 1]
random_state: 42
thickness_list: [1, 2]

Vectorizer:
complexity: 4

Estimator:
     alpha: 1e-07
      eta0: 0.01
  l1_ratio: 0.698057935388
learning_rate: invscaling
      loss: squared_hinge
    n_iter: 36
   penalty: l2
   power_t: 0.193562877926


	Iteration: 2/5 (after 83.9 sec; 0:01:23.925014)
Best score (roc_auc): 0.794 (0.882 +- 0.088)

Data:
Instances: 192 ; Features: 1048577 with an avg of 354 features
class: 1 count:92 (0.48)	class: -1 count:100 (0.52)	

	Model parameters:

Pre_processor:
    burnin: 1
complexity: 5
improving_threshold: 0.5
max_core_size_diff: 3
 n_samples: 4
   n_steps: 5
        nu: 0.33
radius_list: [0, 1]
random_state: 42
thickness_list: [1, 2]

Vectorizer:
complexity: 4

Estimator:
     alpha: 1e-08
      eta0: 0.01
  l1_ratio: 0.698057935388
learning_rate: optimal
      loss: squared_hinge
    n_iter: 89
   penalty: elasticnet
   power_t: 0.712351665998


	Iteration: 2/5 (after 85.8 sec; 0:01:25.816994)
Best score (roc_auc): 0.806 (0.881 +- 0.075)

Data:
Instances: 192 ; Features: 1048577 with an avg of 354 features
class: 1 count:92 (0.48)	class: -1 count:100 (0.52)	

	Model parameters:

Pre_processor:
    burnin: 1
complexity: 5
improving_threshold: 0.5
max_core_size_diff: 3
 n_samples: 4
   n_steps: 5
        nu: 0.33
radius_list: [0, 1]
random_state: 42
thickness_list: [1, 2]

Vectorizer:
complexity: 4

Estimator:
     alpha: 1e-06
      eta0: 0.01
  l1_ratio: 0.698057935388
learning_rate: constant
      loss: squared_hinge
    n_iter: 96
   penalty: l2
   power_t: 0.939838428481
graph grammar stats:
#instances:50   #interfaces: 50   #cores: 53   #core-interface-pairs: 158
graph grammar stats:
#instances:50   #interfaces: 64   #cores: 53   #core-interface-pairs: 199


	Iteration: 3/5 (after 137.9 sec; 0:02:17.948988)
Best score (roc_auc): 0.817 (0.915 +- 0.098)

Data:
Instances: 188 ; Features: 1048577 with an avg of 377 features
class: 1 count:94 (0.50)	class: -1 count:94 (0.50)	

	Model parameters:

Pre_processor:
    burnin: 1
complexity: 5
improving_threshold: 0.25
max_core_size_diff: 0
 n_samples: 4
   n_steps: 5
        nu: 0.1
radius_list: [0, 1]
random_state: 42
thickness_list: [1, 2]

Vectorizer:
complexity: 4

Estimator:
     alpha: 1e-07
      eta0: 0.01
  l1_ratio: 0.698057935388
learning_rate: optimal
      loss: squared_hinge
    n_iter: 89
   penalty: elasticnet
   power_t: 0.939838428481


	Iteration: 3/5 (after 140.6 sec; 0:02:20.606755)
Best score (roc_auc): 0.869 (0.929 +- 0.061)

Data:
Instances: 188 ; Features: 1048577 with an avg of 377 features
class: 1 count:94 (0.50)	class: -1 count:94 (0.50)	

	Model parameters:

Pre_processor:
    burnin: 1
complexity: 5
improving_threshold: 0.25
max_core_size_diff: 0
 n_samples: 4
   n_steps: 5
        nu: 0.1
radius_list: [0, 1]
random_state: 42
thickness_list: [1, 2]

Vectorizer:
complexity: 4

Estimator:
     alpha: 1e-07
      eta0: 0.01
  l1_ratio: 0.698057935388
learning_rate: constant
      loss: squared_hinge
    n_iter: 89
   penalty: l2
   power_t: 0.193562877926
graph grammar stats:
#instances:50   #interfaces: 50   #cores: 53   #core-interface-pairs: 158
graph grammar stats:
#instances:50   #interfaces: 64   #cores: 53   #core-interface-pairs: 199
graph grammar stats:
#instances:50   #interfaces: 50   #cores: 53   #core-interface-pairs: 158
graph grammar stats:
#instances:50   #interfaces: 64   #cores: 53   #core-interface-pairs: 199
Saved current best model in bursi
Parameters:

	Model parameters:

Pre_processor:
    burnin: 1
complexity: 5
improving_threshold: 0.25
max_core_size_diff: 0
 n_samples: 4
   n_steps: 5
        nu: 0.1
radius_list: [0, 1]
random_state: 42
thickness_list: [1, 2]

Vectorizer:
complexity: 4

Estimator:
     alpha: 1e-07
      eta0: 0.01
  l1_ratio: 0.698057935388
learning_rate: constant
      loss: squared_hinge
    n_iter: 89
   penalty: l2
   power_t: 0.193562877926
Test set
Instances: 100 ; Features: 1048577 with an avg of 334 features per instance
--------------------------------------------------------------------------------
Test Estimate
             precision    recall  f1-score   support

         -1       0.71      0.70      0.71        50
          1       0.71      0.72      0.71        50

avg / total       0.71      0.71      0.71       100

APR: 0.773
ROC: 0.802
Cross-validated estimate
            accuracy: 0.670 +- 0.144
           precision: 0.677 +- 0.165
              recall: 0.700 +- 0.126
                  f1: 0.683 +- 0.137
   average_precision: 0.797 +- 0.104
             roc_auc: 0.780 +- 0.110
CPU times: user 28.2 s, sys: 8.75 s, total: 37 s
Wall time: 4min 10s
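
The returned model bundles the optimized components; they can be pulled out for reuse, mirroring the calls inside constructive_model (a sketch; pos_test/neg_test stand for any held-out graph streams):

opt_vectorizer = model.get_vectorizer()
opt_estimator = model.get_estimator()
# apr, roc = estimate(pos_test, neg_test, estimator=opt_estimator, vectorizer=opt_vectorizer)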

Explicit tuning


In [8]:
%%time
#explicit experiment
start_global = time()

#train a model on the sampled data, then test it on held-out original data
#(disjoint from the molecules that seeded the sampling) and compare
from eden.graph import Vectorizer
vectorizer = Vectorizer(5)

#setup
size=100
pos_fname='bursi_pos_orig.smi'
neg_fname='bursi_neg_orig.smi'
iterable_pos = get_graphs(pos_fname, size=size)
iterable_neg = get_graphs(neg_fname, size=size)
random_state=42
train_test_split=.7

#split train/test
from eden.util import random_bipartition_iter
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)

args = {'random_state':42,
        'complexity':5,
        'nu':0.1,
        'radius_list':[0,1],
        'thickness_list':[1,2],
        'n_steps':5,
        'n_samples':4,
        'burnin':1,
        'improving_threshold':0.25,
        'max_core_size_diff':0}
            
logger.info('-'*80)
logger.info('Params:')
from eden.util import serialize_dict
logger.info(serialize_dict(args))

#train
start = time()
logger.info('-'*80)
logger.info('Grammar induction:')
logger.info('Positives:')
sampled_pos = generate_sample(iterable_pos_train, **args)
logger.info('Time elapsed: %s'%(datetime.timedelta(seconds=(time() - start))))

start = time()
logger.info('Negatives:')
sampled_neg = generate_sample(iterable_neg_train, **args)
logger.info('Time elapsed: %s'%(datetime.timedelta(seconds=(time() - start))))

start = time()
logger.info('-'*80)
logger.info('Fitting:')
from eden.util import fit
estimator = fit(sampled_pos, 
                sampled_neg, 
                vectorizer, 
                fit_flag=False, 
                n_jobs=-1, 
                cv=10, 
                n_iter_search=5, 
                random_state=1, 
                block_size=100)
logger.info('Time elapsed: %s'%(datetime.timedelta(seconds=(time() - start))))


#test
start = time()
logger.info('-'*80)
logger.info('Testing:')
from eden.util import estimate
apr, roc = estimate(iterable_pos_test,
                    iterable_neg_test,  
                    estimator, 
                    vectorizer, 
                    block_size=100, 
                    n_jobs=-1)
logger.info('Time elapsed: %s'%(datetime.timedelta(seconds=(time() - start))))

logger.info('Global time elapsed: %s'%(datetime.timedelta(seconds=(time() - start_global))))


--------------------------------------------------------------------------------
Params:
    burnin: 1
complexity: 5
improving_threshold: 0.25
max_core_size_diff: 0
 n_samples: 4
   n_steps: 5
        nu: 0.1
radius_list: [0, 1]
random_state: 42
thickness_list: [1, 2]
--------------------------------------------------------------------------------
Grammar induction:
Positives:
graph grammar stats:
#instances:70   #interfaces: 72   #cores: 75   #core-interface-pairs: 241
Time elapsed: 0:00:05.828723
Negatives:
graph grammar stats:
#instances:70   #interfaces: 86   #cores: 63   #core-interface-pairs: 262
Time elapsed: 0:00:05.877782
--------------------------------------------------------------------------------
Fitting:
Time elapsed: 0:01:04.146812
--------------------------------------------------------------------------------
Testing:
Test set
Instances: 60 ; Features: 1048577 with an avg of 463 features per instance
--------------------------------------------------------------------------------
Test Estimate
             precision    recall  f1-score   support

         -1       0.68      0.50      0.58        30
          1       0.61      0.77      0.68        30

avg / total       0.64      0.63      0.63        60

APR: 0.617
ROC: 0.717
Cross-validated estimate
            accuracy: 0.633 +- 0.113
           precision: 0.610 +- 0.091
              recall: 0.767 +- 0.170
                  f1: 0.672 +- 0.107
   average_precision: 0.731 +- 0.157
             roc_auc: 0.733 +- 0.144
Time elapsed: 0:00:02.400114
Global time elapsed: 0:01:18.427208
CPU times: user 8.3 s, sys: 3.01 s, total: 11.3 s
Wall time: 1min 18s
