Consider a binary classification problem. We will fit a predictor and use it to assign a weight score to each node in each instance; this operation is referred to as "annotation". For illustration purposes we will display a few annotated graphs. We will see that building a predictor on the annotated instances can increase the predictive performance.
Load the data and convert it to graphs
In [1]:
from eden.converter.graph.gspan import gspan_to_eden
from eden.util import random_bipartition_iter

# gSpan input files: one for the positive class, one for the negative class.
pos = 'bursi.pos.gspan'
neg = 'bursi.neg.gspan'

# Value handed to the splitter as relative_size
# (presumably the fraction that ends up in the first, i.e. train, part -- TODO confirm).
train_test_split = 0.9

# Convert each file into an iterable of graphs.
iterable_pos = gspan_to_eden(pos)
iterable_neg = gspan_to_eden(neg)

# Split each class separately into a train part and a test part.
iterable_pos_train, iterable_pos_test = random_bipartition_iter(
    iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(
    iterable_neg, relative_size=train_test_split)
Set up the vectorizer
In [12]:
from eden.graph import Vectorizer

# Single vectorizer instance shared by the fit, estimate and annotate calls below.
vectorizer = Vectorizer(complexity=2)
In [13]:
%%time
from itertools import tee
iterable_pos_train,iterable_pos_train_=tee(iterable_pos_train)
iterable_neg_train,iterable_neg_train_=tee(iterable_neg_train)
iterable_pos_test,iterable_pos_test_=tee(iterable_pos_test)
iterable_neg_test,iterable_neg_test_=tee(iterable_neg_test)
from eden.util import fit,estimate
estimator = fit(iterable_pos_train_, iterable_neg_train_, vectorizer, n_iter_search=5)
estimate(iterable_pos_test_, iterable_neg_test_, estimator, vectorizer)
Annotate the instances and iterate over the resulting graphs.
Display the first three graphs as examples. Color the vertices using the annotated 'importance' attribute.
In [14]:
# Print the built-in documentation of the annotate method used in the cells below.
help(vectorizer.annotate)
In [15]:
%matplotlib inline
from itertools import tee
iterable_pos_train,iterable_pos_train_=tee(iterable_pos_train)
graphs = vectorizer.annotate( iterable_pos_train_, estimator=estimator )
import itertools
graphs = itertools.islice( graphs, 3 )
from eden.util.display import draw_graph
for graph in graphs: draw_graph( graph, vertex_color='importance', size=10 )
In [16]:
%matplotlib inline
from itertools import tee
iterable_pos_train,iterable_pos_train_=tee(iterable_pos_train)
graphs = vectorizer.annotate( iterable_pos_train_, estimator=estimator )
from eden.modifier.graph.vertex_attributes import colorize_binary
graphs = colorize_binary(graph_list = graphs, output_attribute = 'color_value', input_attribute='importance', level=0)
import itertools
graphs = itertools.islice( graphs, 3 )
from eden.util.display import draw_graph
for graph in graphs: draw_graph( graph, vertex_color='color_value', size=10 )
Create a data matrix, this time using the annotated graphs. Note that the graphs are now weighted.
Evaluate the predictive performance on the weighted graphs.
In [17]:
%%time
a_estimator=estimator
num_iterations = 3
reweight = 0.6
for i in range(num_iterations):
print 'Iteration %d'%i
from itertools import tee
iterable_pos_train_=vectorizer.annotate( iterable_pos_train, estimator=a_estimator, reweight=reweight )
iterable_neg_train_=vectorizer.annotate( iterable_neg_train, estimator=a_estimator, reweight=reweight )
iterable_pos_test_=vectorizer.annotate( iterable_pos_test, estimator=a_estimator, reweight=reweight )
iterable_neg_test_=vectorizer.annotate( iterable_neg_test, estimator=a_estimator, reweight=reweight )
iterable_pos_train,iterable_pos_train_=tee(iterable_pos_train_)
iterable_neg_train,iterable_neg_train_=tee(iterable_neg_train_)
iterable_pos_test,iterable_pos_test_=tee(iterable_pos_test_)
iterable_neg_test,iterable_neg_test_=tee(iterable_neg_test_)
from eden.util import fit,estimate
a_estimator = fit(iterable_pos_train_, iterable_neg_train_, vectorizer)
estimate(iterable_pos_test_, iterable_neg_test_, a_estimator, vectorizer)