Circle abstraction for chemical structure graphs

In this notebook I show how to use Graphlearn with abstract graphs to generate molecules, using an abstraction that deals with circular motifs (rings).


In [1]:
%load_ext autoreload
%autoreload 2
from eden.util import configure_logging
import logging

%matplotlib inline

`get_graphs` provides the data (networkx graphs).

In [2]:
from itertools import islice

# NOTE(review): the module path was missing from this import
# (`from import gspan_to_eden` is a SyntaxError). `eden.io.gspan` is
# presumably the right location for the EDeN 1.0.x release this notebook
# runs against -- confirm against the installed EDeN version.
from eden.io.gspan import gspan_to_eden


def get_graphs(dataset_fname='../../toolsdata/bursi.pos.gspan', size=100):
    """Lazily read graphs from a gSpan file.

    Args:
        dataset_fname: path of the gSpan file handed to ``gspan_to_eden``.
        size: maximum number of graphs to yield; ``None`` yields all of them.

    Returns:
        An ``itertools.islice`` iterator over at most ``size`` graphs.
        It is single-pass: once consumed, call ``get_graphs`` again.
    """
    return islice(gspan_to_eden(dataset_fname), size)

The next cell demonstrates what an abstract graph looks like.

In [3]:
from graphlearn.utils import draw
import graphlearn.minor.molecule.transform_cycle as mole
import graphlearn.minor.decompose as decompose
from graphlearn.graphlearn import Sampler as GLS
from eden.graph import Vectorizer

# Get a graph and prepare it.
# NOTE(review): `g` (the input graph) and `preproc` (the object whose
# `abstract` method is called below) are not defined in any visible cell.
# Presumably `preproc` is a transformer from `mole` and `g` is taken from
# get_graphs() -- confirm and restore both definitions so this cell runs on
# a fresh kernel.

# Open question kept from the author: why would a decomposer be needed here?
# No decomposition is performed in this cell, so the code stays disabled.
#decomposer = decompose.MinorDecomposer(
#                       include_base=False,
#                       base_thickness_list=[2])

# The preprocessor makes the abstraction; the (disabled) wrapper would only
# provide a convenient format for drawing.


#graph_wrapper = decomposer.make_new_decomposer(g)

# Let the preprocessor produce the abstraction of `g`.
graph= preproc.abstract(g)

# Draw the result; all display options are passed straight to draw.graphlearn.
draw.graphlearn(graph,size=10, abstract_color='red', contract=True,ignore_for_layout='nesting')

Here we see the different views on a graph that the wrapper provides.

For the sampling process, CIPs (core-interface pairs) need to be extracted; the cell below shows this mechanism.

In [4]:
from graphlearn.utils import draw
from graphlearn.graphlearn import Sampler as GLS
from eden.graph import Vectorizer

import graphlearn.minor.molecule.transform_cycle as mole
import graphlearn.minor.decompose as decompose

# Decomposer with default settings; it is used below to wrap the transformed
# graph.
decomposer = decompose.MinorDecomposer()


# NOTE(review): `g` and `transformer` are not defined in any visible cell;
# they must exist in the kernel before this cell runs.
for i in range(1):
    # Python 2 print statements: this cell will not parse under Python 3.
    print 'grammar example %d' % i
    # Show the graph-level attribute dict of `g`.
    print g.graph
    # Transform `g` (transform takes and returns a sequence, hence the
    # one-element list and the [0]) and wrap the result with the decomposer.
    gm=decomposer.make_new_decomposer( transformer.transform([g])[0])
    # Draw three views offered by the wrapper side by side: the nested
    # pre-vectorizer graph, the abstract graph and the base graph.
    # NOTE(review): this call is never closed -- its remaining arguments and
    # the closing parenthesis are missing, so the cell is a SyntaxError as
    # it stands.
    draw.graphlearn([gm.pre_vectorizer_graph(nested=True), gm.abstract_graph(),gm.base_graph()], 
                    size = 15,
                    vertex_label = 'label',

# Argument tuple, presumably for the CIP extraction routine -- the call that
# consumes `argz` and produces `cips` is not present in the visible source.
# NOTE(review): `radius_list`, `thickness_list` and `base_thickness_list`
# are also undefined here.
argz=(gm,radius_list,thickness_list,Vectorizer(),2**20-1,lambda x,y:True, base_thickness_list)

# Draw the first two entries of the first element of `cips`, uncontracted.
draw.graphlearn(cips[0][0].graph, contract=False)
draw.graphlearn(cips[0][1].graph, contract=False)

grammar example 0
{'id': 't # id 1 target: 1\n'}
Here we prepare a sampler and then take a closer look at the generated grammar.

In [5]:
from graphlearn.graphlearn import Sampler as GLS
# Request up to 200 graphs (the output below reports "#instances: 200").
# NOTE(review): the statements that construct and fit the sampler -- and that
# produced the parameter dump and timing output shown below -- are not
# present in this cell; later cells rely on a `sampler` object.
graphs = get_graphs(size=200)

accept_min_similarity: 0.0
accept_static_penalty: 0.0
batch_size: 10
    burnin: 0
core_choice_bytrial: False
core_choice_bytrial_multiplier: 1.0
decomposer: no graphs in decomposer
estimatorobject: <graphlearn.estimate.OneClassEstimator instance at 0x7f00bbb74758>
feasibility_checker: <graphlearn.feasibility.FeasibilityChecker instance at 0x7f00bbb74b48>
graph_transformer: <graphlearn.minor.molecule.transform_cycle.GraphTransformerCircles object at 0x7f00b99cc390>
improving_linear_start_absolute: 0
improving_linear_start_fraction: 0
improving_penalty_per_step: 0.02
improving_threshold_absolute: 50
improving_threshold_fraction: -1
include_seed: False
keep_duplicates: False
      lsgg: <graphlearn.localsubstitutablegraphgrammar.LocalSubstitutableGraphGrammar object at 0x7f00b9cf3690>
maxbacktrack: 0
   monitor: False
  monitors: []
    n_jobs: 0
 n_samples: None
   n_steps: 50
orig_cip_max_positives: 1
orig_cip_min_positives: 0
orig_cip_score_tricks: False
probabilistic_core_choice: True
proposal_probability: False
quick_skip_orig_cip: False
random_state: None
sampling_interval: 9999
score_core_choice: False
select_cip_max_tries: 20
similarity: -1
size_constrained_core_choice: -1
size_diff_core_filter: -1
vectorizer:               __version__: 1.0.1                   bitmask: 65535                block_size: 100                complexity: 3                         d: 3                  discrete: True              feature_size: 65537       inner_normalization: True                 key_class: class            key_importance: importance                 key_label: label               key_nesting: nesting                  key_svec: svec                   key_vec: vec                key_weight: weight                     min_d: 0                     min_r: 0                    n_jobs: 1                      name: Vectorizer                     nbits: 16             normalization: True                positional: False                         r: 3              weights_dict: None
#instances: 200  #interfaces: 82   #cores: 66   #core-interface-pairs: 314

SGDClassifier(alpha=0.000187567852834, average=False, class_weight=None,
       epsilon=0.1, eta0=0.363636782851, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=82, n_jobs=1,
       penalty='l1', power_t=0.299025634434, random_state=None,
       shuffle=True, verbose=0, warm_start=False)

Predictive performance:
            accuracy: 1.000 +- 0.000
           precision: 1.000 +- 0.000
              recall: 1.000 +- 0.000
                  f1: 1.000 +- 0.000
   average_precision: 1.000 +- 0.000
             roc_auc: 1.000 +- 0.000
CPU times: user 9.25 s, sys: 800 ms, total: 10 s
Wall time: 20.9 s

In [6]:
# Show congruent CIPs (core-interface pairs) in the grammar.
# NOTE(review): only the trailing keyword arguments of the drawing call
# survive below; the head of the call (function name and leading arguments)
# is missing and must be restored before this cell can run.
                     n_graphs_per_line=5, size=9, contract=False,
                     colormap='Paired', invert_colormap=False,
                     vertex_alpha=0.6, edge_alpha=0.5,  abstract_interface=True)

interface id: 8188 [23 options]
interface id: 610744 [16 options]
interface id: 555600 [15 options]
interface id: 161292 [15 options]
interface id: 735276 [14 options]

Molecule sampling

In [7]:
import graphlearn.utils.draw as draw
import itertools

# Parameters.
# NOTE(review): `id_start`, `id_end`, `ids` and `sampler` are used below but
# are not defined in any visible cell -- presumably the parameter
# assignments were lost from this cell; restore them so it runs on a fresh
# kernel.
graphs = get_graphs()
# Keep only the seed graphs at positions id_start .. id_end-1.
graphs = itertools.islice(graphs,id_start,id_end)

# Run the sampler on the selected seed graphs.
graphs = sampler.transform(graphs)

for i,path_graphs in enumerate(graphs):
    # path_graphs is a list of 'sampled' graphs for one run.
    # Python 2 print statement: this cell will not parse under Python 3.
    print 'Graph id: %d'%(ids[i])
    # Save score.
    # NOTE(review): no code follows this step, yet the plotting cell reads a
    # `scores` collection -- the statement that fills it appears to be missing.
    # Draw.
    # NOTE(review): the head of the drawing call is missing; only its
    # trailing keyword arguments remain below.
                           n_graphs_per_line=5, size=10, 
                           colormap='Paired', invert_colormap=False, vertex_color='color_level',
                           vertex_alpha=0.5, edge_alpha=0.7,edge_label='label'  )

Show the score history for each sampling run.

In [8]:
%matplotlib inline
from itertools import islice
import numpy as np
import matplotlib.pyplot as plt
# Plot the score history of the sampling runs, `num_graphs_per_plot` runs per
# figure.
# NOTE(review): `scores`, `num_plots`, `num_graphs_per_plot`, `step` and
# `id_start` are not defined in any visible cell; they must exist in the
# kernel before this cell runs.
for i in range(num_plots):
    first = i * num_graphs_per_plot
    # One explicit figure per iteration. Without it every run is drawn onto
    # the same implicit axes, so the intended `num_plots` separate plots
    # collapse into one and the legend is rebuilt on each pass.
    fig, ax = plt.subplots()
    for j, score in enumerate(scores[first:first + num_graphs_per_plot]):
        # Thin the history out to every `step`-th value before plotting.
        data = list(islice(score, None, None, step))
        ax.plot(data, label='graph %d' % (j + first + id_start))
    ax.set_title('score history, plot %d' % i)
    ax.set_xlabel('recorded score index')
    ax.set_ylabel('score')
    ax.legend(loc='lower right')
    # Render this figure now so the plots appear in order.
    plt.show()

