Graph format

The EDeN library allows the vectorization of graphs, i.e. the transformation of graphs into sparse vectors.

The graphs that can be processed by the EDeN library have the following restrictions:

  • the graphs are implemented as networkx graphs
  • nodes and edges have identifiers: the following identifiers are used as reserved words

    1. label
    2. weight
    3. entity
    4. nesting
  • nodes and edges must have the 'label' attribute

  • the 'label' attribute can be of one of the following types:

    1. string
    2. vector
    3. dictionary

    strings are used to represent categorical values; dictionaries are used to represent sparse vectors: keys are of string type and values are of type float

  • nodes and edges can have a 'weight' attribute of type float
  • nodes can have a 'entity' attribute of type string
  • nesting edges must have a 'nesting' attribute of type boolean set to True

In [1]:
%matplotlib inline
import pylab as plt
import networkx as nx

In [2]:
G=nx.Graph()
G.add_node(0, label='A')
G.add_node(1, label='B')
G.add_node(2, label='C')

G.add_edge(0,1, label='x')
G.add_edge(1,2, label='y')
G.add_edge(2,0, label='z')

In [3]:
from eden.util import display
print display.serialize_graph(G)


{
    "directed":false,
    "graph":[],
    "nodes":[
        {
            "id":0,
            "label":"A"
        },
        {
            "id":1,
            "label":"B"
        },
        {
            "id":2,
            "label":"C"
        }
    ],
    "links":[
        {
            "source":0,
            "target":1,
            "label":"x"
        },
        {
            "source":0,
            "target":2,
            "label":"z"
        },
        {
            "source":1,
            "target":2,
            "label":"y"
        }
    ],
    "multigraph":false
}

In [4]:
from eden.util import display
display.draw_graph(G,  size=15, node_size=1500, font_size=24, node_border=True, size_x_to_y_ratio=3)



In [5]:
G=nx.Graph()
G.add_node(0, label=[0,0,.1])
G.add_node(1, label=[0,.1,0])
G.add_node(2, label=[.1,0,0])

G.add_edge(0,1, label='x')
G.add_edge(1,2, label='y')
G.add_edge(2,0, label='z')

In [6]:
display.draw_graph(G, size=15, node_size=1500, font_size=24, node_border=True, size_x_to_y_ratio=3)



In [7]:
G=nx.Graph()
G.add_node(0, label={'A':1, 'B':2, 'C':3})
G.add_node(1, label={'A':1, 'B':2, 'D':3})
G.add_node(2, label={'A':1, 'D':2, 'E':3})

G.add_edge(0,1, label='x')
G.add_edge(1,2, label='y')
G.add_edge(2,0, label='z')

In [8]:
display.draw_graph(G, size=15, node_size=1500, font_size=24, node_border=True, size_x_to_y_ratio=3)



In [9]:
G=nx.Graph()
G.add_node(0, label='A')
G.add_node(1, label='B')
G.add_node(2, label='C')
G.add_node(3, label='D')
G.add_node(4, label='E')
G.add_node(5, label='F')

G.add_edge(0,1, label='x')
G.add_edge(0,2, label='y')
G.add_edge(1,3, label='z', nesting=True, weight=.5)
G.add_edge(0,3, label='z', nesting=True, weight=.1)
G.add_edge(2,3, label='z', nesting=True, weight=.01)
G.add_edge(3,4, label='k')
G.add_edge(3,5, label='j')

In [10]:
display.draw_graph(G, size=15, node_size=1500, font_size=24, node_border=True, size_x_to_y_ratio=3, prog='circo')



In [11]:
from eden.graph import Vectorizer
X=Vectorizer(2).transform_single(G)
from eden.util import describe
print describe(X)
print X


Instances: 1 ; Features: 1048577 with an avg of 40 features per instance
  (0, 11020)	0.119522860933
  (0, 14040)	0.153392997769
  (0, 63202)	0.0597614304667
  (0, 77252)	0.0597614304667
  (0, 92565)	0.129099444874
  (0, 209851)	0.22360679775
  (0, 217622)	0.0845154254729
  (0, 218114)	0.0845154254729
  (0, 220167)	0.22360679775
  (0, 225558)	0.158113883008
  (0, 253882)	0.1792842914
  (0, 271316)	0.230089496654
  (0, 306648)	0.1
  (0, 340437)	0.1
  (0, 399061)	0.153392997769
  (0, 411680)	0.1792842914
  (0, 441622)	0.158113883008
  (0, 535314)	0.182574185835
  (0, 554828)	0.0845154254729
  (0, 557516)	0.0845154254729
  (0, 559850)	0.0845154254729
  (0, 588439)	0.129099444874
  (0, 605660)	0.129099444874
  (0, 616034)	0.119522860933
  (0, 619593)	0.129099444874
  (0, 627940)	0.169030850946
  (0, 635247)	0.182574185835
  (0, 703110)	0.0845154254729
  (0, 717898)	0.22360679775
  (0, 718505)	0.182574185835
  (0, 736760)	0.158113883008
  (0, 760739)	0.22360679775
  (0, 761988)	0.22360679775
  (0, 768731)	0.2
  (0, 806450)	0.2
  (0, 858080)	0.158113883008
  (0, 961261)	0.129099444874
  (0, 1001844)	0.169030850946
  (0, 1016151)	0.129099444874
  (0, 1034087)	0.22360679775

In [12]:
G=nx.Graph()
G.add_node(0, label='A')
G.add_node(1, label='B')
G.add_node(2, label='C')
G.add_node(3, label='D')
G.add_node(4, label='E')
G.add_node(5, label='F')

G.add_edge(0,1, label='x')
G.add_edge(0,2, label='y')
G.add_edge(1,3, label='z', nesting=True)
G.add_edge(0,3, label='z', nesting=True)
G.add_edge(2,3, label='z', nesting=True)
G.add_edge(3,4, label='k')
G.add_edge(3,5, label='j')

from eden.graph import Vectorizer
X=Vectorizer(2).transform_single(G)
from eden.util import describe
print describe(X)
print X


Instances: 1 ; Features: 1048577 with an avg of 40 features per instance
  (0, 11020)	0.119522860933
  (0, 14040)	0.153392997769
  (0, 63202)	0.0597614304667
  (0, 77252)	0.0597614304667
  (0, 92565)	0.129099444874
  (0, 209851)	0.22360679775
  (0, 217622)	0.0845154254729
  (0, 218114)	0.0845154254729
  (0, 220167)	0.22360679775
  (0, 225558)	0.158113883008
  (0, 253882)	0.1792842914
  (0, 271316)	0.230089496654
  (0, 306648)	0.1
  (0, 340437)	0.1
  (0, 399061)	0.153392997769
  (0, 411680)	0.1792842914
  (0, 441622)	0.158113883008
  (0, 535314)	0.182574185835
  (0, 554828)	0.0845154254729
  (0, 557516)	0.0845154254729
  (0, 559850)	0.0845154254729
  (0, 588439)	0.129099444874
  (0, 605660)	0.129099444874
  (0, 616034)	0.119522860933
  (0, 619593)	0.129099444874
  (0, 627940)	0.169030850946
  (0, 635247)	0.182574185835
  (0, 703110)	0.0845154254729
  (0, 717898)	0.22360679775
  (0, 718505)	0.182574185835
  (0, 736760)	0.158113883008
  (0, 760739)	0.22360679775
  (0, 761988)	0.22360679775
  (0, 768731)	0.2
  (0, 806450)	0.2
  (0, 858080)	0.158113883008
  (0, 961261)	0.129099444874
  (0, 1001844)	0.169030850946
  (0, 1016151)	0.129099444874
  (0, 1034087)	0.22360679775

Build graphs and then display them


In [13]:
import networkx as nx
graph_list = []

In [14]:
G=nx.Graph()
G.add_node(0, label='A', entity='CATEG')
G.add_node(1, label='B', entity='CATEG')
G.add_node(2, label='C', entity='CATEG')
G.add_edge(0,1, label='a', entity='CATEG_EDGE')
G.add_edge(1,2, label='b', entity='CATEG_EDGE')
graph_list += [G.copy()]

G=nx.Graph()
G.add_node(0, label='A', entity='CATEG')
G.add_node(1, label='B', entity='CATEG')
G.add_node(2, label='X', entity='CATEG')
G.add_edge(0,1, label='a', entity='CATEG_EDGE')
G.add_edge(1,2, label='b', entity='CATEG_EDGE')
graph_list += [G.copy()]

G=nx.Graph()
G.add_node(0, label='A', entity='CATEG')
G.add_node(1, label='B', entity='CATEG')
G.add_node(2, label='X', entity='CATEG')
G.add_edge(0,1, label='x', entity='CATEG_EDGE')
G.add_edge(1,2, label='x', entity='CATEG_EDGE')
graph_list += [G.copy()]

G=nx.Graph()
G.add_node(0, label='X', entity='CATEG')
G.add_node(1, label='X', entity='CATEG')
G.add_node(2, label='X', entity='CATEG')
G.add_edge(0,1, label='x', entity='CATEG_EDGE')
G.add_edge(1,2, label='x', entity='CATEG_EDGE')
graph_list += [G.copy()]

In [15]:
G=nx.Graph()
G.add_node(0, label=[1,0,0], entity='VEC')
G.add_node(1, label=[0,1,0], entity='VEC')
G.add_node(2, label=[0,0,1], entity='VEC')
G.add_edge(0,1, label='a', entity='CATEG_EDGE')
G.add_edge(1,2, label='b', entity='CATEG_EDGE')
graph_list += [G.copy()]

G=nx.Graph()
G.add_node(0, label=[1,1,0], entity='VEC')
G.add_node(1, label=[0,1,1], entity='VEC')
G.add_node(2, label=[0,0,1], entity='VEC')
G.add_edge(0,1, label='a', entity='CATEG_EDGE')
G.add_edge(1,2, label='b', entity='CATEG_EDGE')
graph_list += [G.copy()]

G=nx.Graph()
G.add_node(0, label=[1,0.1,0.2], entity='VEC')
G.add_node(1, label=[0.3,1,0.4], entity='VEC')
G.add_node(2, label=[0.5,0.6,1], entity='VEC')
G.add_edge(0,1, label='a', entity='CATEG_EDGE')
G.add_edge(1,2, label='b', entity='CATEG_EDGE')
graph_list += [G.copy()]

G=nx.Graph()
G.add_node(0, label=[0.1,0.2,0.3], entity='VEC')
G.add_node(1, label=[0.4,0.5,0.6], entity='VEC')
G.add_node(2, label=[0.7,0.8,0.9], entity='VEC')
G.add_edge(0,1, label='a', entity='CATEG_EDGE')
G.add_edge(1,2, label='b', entity='CATEG_EDGE')
graph_list += [G.copy()]

In [16]:
G=nx.Graph()
G.add_node(0, label={'A':1, 'B':1, 'C':1}, entity='SPVEC')
G.add_node(1, label={'a':1, 'B':1, 'C':1}, entity='SPVEC')
G.add_node(2, label={'a':1, 'b':1, 'C':1}, entity='SPVEC')
G.add_edge(0,1, label='a', entity='CATEG_EDGE')
G.add_edge(1,2, label='b', entity='CATEG_EDGE')
graph_list += [G.copy()]

G=nx.Graph()
G.add_node(0, label={'A':1,  'C':1, 'D':1}, entity='SPVEC')
G.add_node(1, label={'a':1,  'C':1, 'D':1}, entity='SPVEC')
G.add_node(2, label={'a':1,  'C':1, 'D':1}, entity='SPVEC')
G.add_edge(0,1, label='a', entity='CATEG_EDGE')
G.add_edge(1,2, label='b', entity='CATEG_EDGE')
graph_list += [G.copy()]

G=nx.Graph()
G.add_node(0, label={'A':1, 'D':1, 'E':1}, entity='SPVEC')
G.add_node(1, label={'a':1, 'D':1, 'E':1}, entity='SPVEC')
G.add_node(2, label={'a':1, 'D':1, 'E':1}, entity='SPVEC')
G.add_edge(0,1, label='a', entity='CATEG_EDGE')
G.add_edge(1,2, label='b', entity='CATEG_EDGE')
graph_list += [G.copy()]

G=nx.Graph()
G.add_node(0, label={'A':1, 'B':1, 'C':1, 'D':1, 'E':1}, entity='SPVEC')
G.add_node(1, label={'a':1, 'B':1, 'C':1, 'D':1, 'E':1}, entity='SPVEC')
G.add_node(2, label={'a':1, 'b':1, 'C':1, 'D':1, 'E':1}, entity='SPVEC')
G.add_edge(0,1, label='a', entity='CATEG_EDGE')
G.add_edge(1,2, label='b', entity='CATEG_EDGE')
graph_list += [G.copy()]

In [17]:
from eden.util import display
for g in graph_list:
    display.draw_graph(g, size=5, node_size=800, node_border=1, layout='shell', secondary_vertex_label = 'entity')


Create a vector representation


In [18]:
%%time
from eden.graph import Vectorizer
vectorizer = Vectorizer(complexity=2, n=4)
vectorizer.fit(graph_list)
X = vectorizer.transform(graph_list)
y=[1]*4+[2]*4+[3]*4
print 'Instances: %d \nFeatures: %d with an avg of %d features per instance' % (X.shape[0], X.shape[1],  X.getnnz()/X.shape[0])


Instances: 12 
Features: 1048577 with an avg of 21 features per instance
CPU times: user 1.25 s, sys: 173 ms, total: 1.42 s
Wall time: 1.42 s
/Library/Python/2.7/site-packages/sklearn/cluster/k_means_.py:1433: RuntimeWarning: Got data type int64, converted to float to avoid overflows
  X = self._check_test_data(X)

In [23]:
opts={'knn': 3, 'metric': 'rbf', 'k_threshold': 0.7, 'gamma': 1e-2}
from eden.embedding import display_embedding, embedding_quality
print 'Embedding quality [adjusted Rand index]: %.2f    data: %s   #classes: %d' % (embedding_quality(X, y, opts), X.shape, len(set(y)))
display_embedding(X,y, opts)


Embedding quality [adjusted Rand index]: 1.00    data: (12, 1048577)   #classes: 3

Compute pairwise similarity matrix


In [20]:
from ipy_table import * 
def prep_table(K):    
    header = [' ']
    header += [i for i in range(K.shape[0])]
    mat = [header]
    for id, row in enumerate(K):
        new_row = [id]
        new_row += list(row)
        mat.append(new_row)
    return mat

In [21]:
from sklearn import metrics

K=metrics.pairwise.pairwise_kernels(X, metric='linear')

mat=prep_table(K)
make_table(mat)
apply_theme('basic')
set_global_style(float_format = '%0.2f')


Out[21]:
 01234567891011
01.000.190.150.000.000.000.000.000.000.000.000.00
10.191.000.380.070.000.000.000.000.000.000.000.00
20.150.381.000.070.000.000.000.000.000.000.000.00
30.000.070.071.000.000.000.000.000.000.000.000.00
40.000.000.000.001.000.550.280.260.000.000.000.00
50.000.000.000.000.551.000.280.300.000.000.000.00
60.000.000.000.000.280.281.000.300.000.000.000.00
70.000.000.000.000.260.300.301.000.000.000.000.00
80.000.000.000.000.000.000.000.001.000.640.060.26
90.000.000.000.000.000.000.000.000.641.000.140.64
100.000.000.000.000.000.000.000.000.060.141.000.16
110.000.000.000.000.000.000.000.000.260.640.161.00

In [ ]: