In [86]:
%matplotlib inline
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [87]:
from eden.converter.molecule import obabel
import networkx as nx
import pybel
import requests
import os.path
from itertools import tee
from numpy.random import randint
from numpy.random import uniform
from eden.graph import Vectorizer
from sklearn.linear_model import SGDClassifier
import datetime, time
from eden.util import random_bipartition_iter
from eden.model import ActiveLearningBinaryClassificationModel

from eden.util import configure_logging
import logging
configure_logging(logging.getLogger(),verbosity=2)

This is where the data sets are defined:


In [89]:
AID = 602325
#AID=2401
DATA_DIR = '/home/liconj/proj/thesis/EDeN/examples/model_comparison/data'
active_fname=DATA_DIR + '/AID%s_active.sdf'%AID
inactive_fname=DATA_DIR + '/AID%s_inactive.sdf'%AID

Functions for training and testing the model


In [106]:
model_fname = DATA_DIR + '/AID%s.model3d'%AID
model_type = "3d"
n_conf = 10
n_iter = 10
active_set_size = 5
n_active_learning_iterations = 0
threshold = 1
train_test_split = 0.8

pre_processor_parameters={'k':randint(1, 10,size=n_iter),
                          'threshold':randint(3, 10, size=n_iter),
                          'n_conf':[n_conf]}

def pre_processor(data, model_type="3d", **kwargs):
    # model_type = kwargs.get('mode', 'default')
    if model_type == "default":
        iterable = obabel.obabel_to_eden(data, **kwargs)
    elif model_type == "3d":
        iterable = obabel.obabel_to_eden3d(data, **kwargs)
    return iterable



vectorizer = Vectorizer()
estimator = SGDClassifier(class_weight='auto', shuffle=True)

# Make predictive model
model = ActiveLearningBinaryClassificationModel(pre_processor,
                                                estimator=estimator,
                                                vectorizer=vectorizer,
                                                n_jobs = 1,
                                                n_blocks = 2,
                                                fit_vectorizer=True)

In [127]:
########
# Create iterables from files
########

iterable_pos = obabel.make_iterable('AID720577_active.sdf', 'sdf')
iterable_neg = obabel.make_iterable('AID720577_inactive.sdf', 'sdf')
iterable_pos, iterable_pos_ = tee(iterable_pos)
iterable_neg, iterable_neg_ = tee(iterable_neg)

start = time.time()
print('# positives: %d  # negatives: %d (%.1f sec %s)'%(sum(1 for x in iterable_pos_), sum(1 for x in iterable_neg_), time.time() - start, str(datetime.timedelta(seconds=(time.time() - start)))))


iterable_pos, iterable_pos_ = tee(iterable_pos)
iterable_neg, iterable_neg_ = tee(iterable_neg)

# Split train/test
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)


# positives: 80  # negatives: 146 (0.1 sec 0:00:00.057244)

In [128]:
########
# Create iterables with the pre_processor
########

pos_graphs = pre_processor(iterable_pos_train, "3d")
neg_graphs = pre_processor(iterable_neg_train, "3d")

########
# Fit vectorizer
########
from eden.graph import Vectorizer
vectorizer = Vectorizer(complexity=1, n=3)

Xp = vectorizer.fit_transform(pos_graphs)
Xn= vectorizer.fit_transform(neg_graphs)

import numpy as np
yp = [1] * Xp.shape[0]
yn = [-1] * Xn.shape[0]
y = np.array(yp + yn)
from scipy.sparse import vstack
X = vstack([Xp,Xn], format="csr")

#induce a predictive model
from sklearn.linear_model import SGDClassifier
predictor = SGDClassifier(average=True, class_weight='auto', shuffle=True, n_jobs=-1)

from sklearn import cross_validation
scores = cross_validation.cross_val_score(predictor, X, y, cv=10, scoring='roc_auc')

import numpy as np
print('AUC ROC: %.4f +- %.4f' % (np.mean(scores),np.std(scores)))


AUC ROC: 0.9955 +- 0.0097

In [111]:
%%time
# Optimize hyperparameters and fit model
# Since this model is fitted much more slowly, use a single vectorizer
#vectorizer_parameters={'complexity':[2,3,4],
#                       'discretization_size':randint(2, 3,size=n_iter),
#                       'discretization_dimension':randint(2, 3,size=n_iter)}




estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      'l1_ratio':uniform(0.1,0.9, size=n_iter),
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      'power_t':uniform(0.1, size=n_iter),
                      'alpha': [10**x for x in range(-8,-2)],
                      'eta0': [10**x for x in range(-4,-1)],
                      'learning_rate': ["invscaling", "constant", "optimal"]}

model.optimize(iterable_pos_train, iterable_neg_train,
               model_name=model_fname,
               n_active_learning_iterations=0,
               size_positive=-1,
               size_negative=active_set_size,
               n_iter=n_iter, cv=3,
               pre_processor_parameters=pre_processor_parameters,
               vectorizer_parameters=vectorizer_parameters,
               estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
         k: [2 4 8 7 5 4 2 1 8 7]
    n_conf: [10]
 threshold: [6 5 8 7 5 3 4 7 4 3]

Vectorizer:
complexity: [1]
         n: 3

Estimator:
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.14382132  0.89052985  0.88810926  0.81740331  0.4980864   0.73702012
  0.87434349  0.7683658   0.72780085  0.54732058]
learning_rate: ['invscaling', 'constant', 'optimal']
      loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
    n_iter: [58 56 13 48 17 20 29 99 73 64]
   penalty: ['l1', 'l2', 'elasticnet']
   power_t: [ 0.79322316  0.74672044  0.21021533  0.92491762  0.14952556  0.64267232
  0.29765783  0.83633371  0.11844824  0.94797485]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-111-36ac14894a69> in <module>()
----> 1 get_ipython().run_cell_magic(u'time', u'', u'# Optimize hyperparameters and fit model\n# Since this model is fitted much more slowly, use a single vectorizer\n#vectorizer_parameters={\'complexity\':[2,3,4],\n#                       \'discretization_size\':randint(2, 3,size=n_iter),\n#                       \'discretization_dimension\':randint(2, 3,size=n_iter)}\n\n\n\n\nestimator_parameters={\'n_iter\':randint(5, 100, size=n_iter),\n                      \'penalty\':[\'l1\',\'l2\',\'elasticnet\'],\n                      \'l1_ratio\':uniform(0.1,0.9, size=n_iter),\n                      \'loss\':[\'hinge\', \'log\', \'modified_huber\', \'squared_hinge\', \'perceptron\'],\n                      \'power_t\':uniform(0.1, size=n_iter),\n                      \'alpha\': [10**x for x in range(-8,-2)],\n                      \'eta0\': [10**x for x in range(-4,-1)],\n                      \'learning_rate\': ["invscaling", "constant", "optimal"]}\n\nmodel.optimize(iterable_pos_train, iterable_neg_train,\n               model_name=model_fname,\n               n_active_learning_iterations=0,\n               size_positive=-1,\n               size_negative=active_set_size,\n               n_iter=n_iter, cv=3,\n               pre_processor_parameters=pre_processor_parameters,\n               vectorizer_parameters=vectorizer_parameters,\n               estimator_parameters=estimator_parameters)')

/home/liconj/.local/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_cell_magic(self, magic_name, line, cell)
   2262             magic_arg_s = self.var_expand(line, stack_depth)
   2263             with self.builtin_trap:
-> 2264                 result = fn(magic_arg_s, cell)
   2265             return result
   2266 

/home/liconj/.local/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)

/home/liconj/.local/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/home/liconj/.local/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1164         else:
   1165             st = clock2()
-> 1166             exec(code, glob, local_ns)
   1167             end = clock2()
   1168             out = None

<timed exec> in <module>()

/home/liconj/proj/thesis/EDeN/eden/model.pyc in optimize(self, iterable_pos, iterable_neg, model_name, n_active_learning_iterations, size_positive, size_negative, lower_bound_threshold_positive, upper_bound_threshold_positive, lower_bound_threshold_negative, upper_bound_threshold_negative, n_iter, n_inner_iter_estimator, max_total_time, pre_processor_parameters, vectorizer_parameters, estimator_parameters, cv, scoring, score_func, two_steps_optimization)
    187                 mean_len_vectorizer_parameters = 0
    188             else:
--> 189                 mean_len_vectorizer_parameters = np.mean([len(vectorizer_parameters[p]) for p in vectorizer_parameters])
    190             if (mean_len_pre_processor_parameters == 1 or mean_len_pre_processor_parameters == 0) and (mean_len_vectorizer_parameters == 1 or mean_len_vectorizer_parameters == 0):
    191                 data_matrix_is_stable = True

TypeError: object of type 'int' has no len()

In [ ]:
%%time
# Estimate predictive performance
model.estimate( iterable_pos_test, iterable_neg_test )

In [ ]:
def test_obabel_model(fname, model_type = "default", model_fname=None):
    from eden.model import ActiveLearningBinaryClassificationModel

    model = ActiveLearningBinaryClassificationModel()
    model.load(model_fname)

    #create iterable from files
    from eden.converter.molecule import obabel
    if model_type == "default":
        iterable=obabel.obabel_to_eden(fname)
    elif model_type == "3d":
        iterable=obabel.obabel_to_eden3d(fname)

    predictions= model.decision_function( iterable )

    return predictions