We learn how to generate a graph minor on chemical structure graphs

this time we use learned abstraction. which means that we try to cluster the parts that get contracted in the minor and use the cluster_id as the name of a minor vertex

initialise nb


In [1]:
%load_ext autoreload
%autoreload 2
from eden.util import configure_logging
import logging

BABELDRAW=False
DEBUG=False
NJOBS=4
if DEBUG: NJOBS=1

configure_logging(logging.getLogger(),verbosity=1+DEBUG)
from IPython.core.display import HTML
HTML('<style>.container { width:95% !important; }</style>')


Out[1]:

In [2]:
%matplotlib inline

In [3]:
# get data
from eden.converter.graph.gspan import gspan_to_eden
from itertools import islice

def get_graphs(dataset_fname='../toolsdata/bursi.pos.gspan', size=100):
    return  islice(gspan_to_eden(dataset_fname),size)

demonstration of the preprocesor learning the abstraction


In [ ]:


In [4]:
from graphlearn.utils import draw
import graphlearn.abstract_graphs.minortransform as transform
import graphlearn.abstract_graphs.minordecompose as decompose
from eden.graph import Vectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans
import math
#preparing
v=Vectorizer(complexity=3)
make_decomposer = decompose.make_decomposergen(include_base=False, base_thickness_list=[2])

# nodes in all graphs get scored.
# the default functionality is to take all scores and cluster them 
# such that nodes that get assigned the same cluster can be contracted in a minor graph.
# ShapeCluster is going the lazy route and uses the score of the node directly for the clusterid 
class ShapeCluster:
    def fit(self,li):
        pass
    def predict(self,i):
        return [math.ceil(i)]

pp=transform.GraphMinorTransformer(#core_shape_cluster =KMeans(n_clusters=4),
                                   core_shape_cluster =ShapeCluster(),
                                   name_cluster       =MiniBatchKMeans(n_clusters=6), 
                                   save_graphclusters =True,
                                   shape_score_threshold=2.5,
                                   shape_min_size=2)
pp.set_param(v)

# the magic happens here
decomposers=[make_decomposer(v,x) for x in pp.fit_transform(get_graphs(size=200))]

# lets look at some clusters
if False:
    for cluster_id in pp.graphclusters:
        print('cluster id: %d  num: %d' % (cluster_id, len(pp.graphclusters[cluster_id])))
        if cluster_id != -1:
            draw.graphlearn(pp.graphclusters[cluster_id][:7], n_graphs_per_line=7, 
                            size=6, vertex_color='_label_', prog='neato', colormap='Set3',
                            contract=False,edge_label='label')


#lets draw what we did there
for i in range(3):
    draw.graphlearn([decomposers[i+5].pre_vectorizer_graph(nested=True),decomposers[i+5].base_graph(),decomposers[i+5].abstract_graph()],
                    size=10,
                    contract=True, 
                    abstract_color='red',
                    vertex_label='label',nesting_edge_alpha=0.7)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-4-c0aec3950f35> in <module>()
     29 
     30 # the magic happens here
---> 31 decomposers=[make_decomposer(v,x) for x in pp.fit_transform(get_graphs(size=200))]
     32 
     33 # lets look at some clusters

/home/ikea/GraphLearn/graphlearn/abstract_graphs/minortransform.py in fit_transform(self, inputs)
    308 
    309         inputs = list(inputs)
--> 310         self.fit(inputs)
    311         return self.transform(inputs)
    312 

/home/ikea/GraphLearn/graphlearn/abstract_graphs/minortransform.py in fit(self, inputs)
    230         # this k means is over the values resulting from annotation
    231         # and determine how a graph will be split intro minor nodes.
--> 232         self.rawgraph_estimator.fit(inputs, vectorizer=self.vectorizer)
    233         self.train_core_shape_cluster(inputs)
    234 

TypeError: fit() got an unexpected keyword argument 'vectorizer'

lets see if these wrappers give us CIPS as this is their only purpose.

this is not interesting.


In [ ]:


In [5]:
#parameters 
radius_list=[0,2]
thickness_list=[2,4]
base_thickness_list=[2]
#extract
cips=decomposers[0].all_core_interface_pairs(thickness_list=[2],radius_list=[0,1],hash_bitmask=2**20-1)
#draw
draw.graphlearn([cips[0][0].graph,cips[0][1].graph], contract=False)


Train sampler


In [6]:
%%time

from graphlearn.graphlearn import Sampler as graphlearn_sampler
graphs = get_graphs(size=1000)
sampler=graphlearn_sampler(radius_list=[0,1],
            thickness_list=[1], 
            min_cip_count=2, 
            min_interface_count=2, 
            decomposergen=make_decomposer,
            graphtransformer=transform.GraphMinorTransformer(
                                   core_shape_cluster =ShapeCluster(),
                                   name_cluster       =MiniBatchKMeans(n_clusters=6), 
                                   save_graphclusters =True)

sampler.fit(graphs,grammar_n_jobs=NJOBS)
print 'done'


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-230fc344c594> in <module>()
----> 1 get_ipython().run_cell_magic(u'time', u'', u"\nfrom graphlearn.graphlearn import Sampler as graphlearn_sampler\ngraphs = get_graphs(size=1000)\nsampler=graphlearn_sampler(radius_list=[0,1],\n            thickness_list=[1], \n            min_cip_count=2, \n            min_interface_count=2, \n            decomposergen=make_decomposer,\n            graphtransformer=transform.GraphMinorTransformer(\n                                   core_shape_cluster =ShapeCluster(),\n                                   name_cluster       =MiniBatchKMeans(n_clusters=6), \n                                   save_graphclusters =True,\n                                   graph_to_minor     =abstractor))\n\nsampler.fit(graphs,grammar_n_jobs=NJOBS)\nprint 'done'\n")

/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.pyc in run_cell_magic(self, magic_name, line, cell)
   2291             magic_arg_s = self.var_expand(line, stack_depth)
   2292             with self.builtin_trap:
-> 2293                 result = fn(magic_arg_s, cell)
   2294             return result
   2295 

<decorator-gen-60> in time(self, line, cell, local_ns)

/usr/local/lib/python2.7/dist-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/usr/local/lib/python2.7/dist-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1165         else:
   1166             st = clock2()
-> 1167             exec(code, glob, local_ns)
   1168             end = clock2()
   1169             out = None

<timed exec> in <module>()

NameError: name 'abstractor' is not defined

Inspect the induced grammar


In [ ]:
draw.draw_grammar(sampler.lsgg.productions,n_productions=5,n_graphs_per_production=5,
                     n_graphs_per_line=5, size=9, contract=False,
                     colormap='Paired', invert_colormap=False,node_border=1,
                     vertex_alpha=0.6, edge_alpha=0.5, node_size=450, abstract_interface=True)

sample molecules


In [ ]:
%%time
import graphlearn.utils.draw as draw
import itertools

#parameters
graphs = get_graphs()
id_start=15
id_end=id_start+9
graphs = itertools.islice(graphs,id_start,id_end)
n_steps=50


# sampling with many arguments.
graphs = sampler.sample(graphs,
                        n_samples=5,
                        batch_size=1,
                        n_steps=n_steps,
                        n_jobs=1,
                        quick_skip_orig_cip=False,
                        probabilistic_core_choice=True,
                        burnin=0,
                        improving_threshold=0.5,
                        select_cip_max_tries=100,
                        keep_duplicates=True,
                        include_seed=True)

 
    
scores=[]
ids=range(id_start,id_end)
for i,path_graphs in enumerate(graphs):
    # for each sampling path:
    print 'Graph id: %d'%(ids[i])
    
    #collect scores so that we can display the score graph later 
    scores.append(sampler.monitors[i].sampling_info['score_history'])
    
    # show graphs
    if not BABELDRAW:
        draw.graphlearn(path_graphs,
                        n_graphs_per_line=5, size=10, 
                       colormap='Paired', invert_colormap=False,node_border=0.5, vertex_color='_label_',
                        vertex_alpha=0.5, edge_alpha=0.7, node_size=450)
    else:
        from graphlearn.utils import openbabel
        openbabel.draw(path_graphs)

plot score graph


In [ ]:
%matplotlib inline
from itertools import islice
import numpy as np
import matplotlib.pyplot as plt

step=1
num_graphs_per_plot=3
num_plots=np.ceil([len(scores)/num_graphs_per_plot])
for i in range(num_plots):
    plt.figure(figsize=(10,5))
    for j,score in enumerate(scores[i*num_graphs_per_plot:i*num_graphs_per_plot+num_graphs_per_plot]):
        data = list(islice(score,None, None, step))
        plt.plot(data, label='graph %d'%(j+i*num_graphs_per_plot+id_start))
    plt.legend(loc='lower right')
    plt.grid()
    plt.ylim(-0.1,1.1)
    plt.show()