This notebook investigates how fast simple seeding queries run when executed directly against the edge store, compared to loading the networks into memory and then operating on them with NetworkX.


In [32]:
from pybel import BELGraph, Manager
import sys
import pandas as pd
from pybel.struct.summary import count_functions
from pybel.manager.models import Edge, network_edge
import time
from sqlalchemy import and_
from pybel.constants import VERSION

In [2]:
print(sys.version)


3.6.3 (default, Oct  9 2017, 09:47:56) 
[GCC 4.2.1 Compatible Apple LLVM 9.0.0 (clang-900.0.37)]

In [3]:
print(time.asctime())


Tue Dec 19 14:16:22 2017

In [4]:
print(VERSION)


0.9.8-dev

In [5]:
m = Manager()
m


Out[5]:
<Manager connection=mysql+mysqldb://root@localhost/pybel?charset=utf8>

In [6]:
pd.DataFrame(
    {
        'id': n.id, 
        'name': n.name, 
        'version': n.version
    } 
    for n in m.list_recent_networks()
)


Out[6]:
id name version
0 10 Alzheimer's Disease Knowledge Assembly 4.3.7
1 11 BEL Framework Large Corpus Document 20150611
2 9 CDR-SB Associations 1.0.5
3 2 NeuroMMSigDB for Alzheimer Disease 20171205
4 6 Paths from CDR-SB to Hippocampal Volumne 1.0.1

In [ ]:
def print_summary(graph):
    """Print how many nodes of each function the given graph contains.

    The ``(function, count)`` pairs returned by :func:`count_functions` are sorted and
    printed one per line.

    :param graph: The graph to summarize; passed directly to :func:`count_functions`
    """
    # Summarize the graph that was passed in, not a notebook-level global, so each
    # timing cell reports on the graph it just built and the cell works on a fresh kernel.
    print(*sorted(count_functions(graph).items()), sep='\n')

Getting a Single Graph

Using NetworkX


In [16]:
%%time
graph_by_network_id = m.get_graph_by_id(11)
print_summary(graph_by_network_id)


('Protein', 6295)
('BiologicalProcess', 195)
('Complex', 1798)
('Abundance', 593)
('Pathology', 146)
('RNA', 5821)
('Reaction', 11)
('Gene', 1002)
('Composite', 7)
('miRNA', 33)
CPU times: user 910 ms, sys: 45.9 ms, total: 956 ms
Wall time: 1.11 s

Using SQL


In [8]:
def get_graph_by_network_edges(manager, network_id, **kwargs):
    """Rebuild a single network as a BELGraph from the edges stored for it.

    :param manager: The manager used to look up the network via ``get_network_by_id``
    :param network_id: The identifier of the network to reconstruct
    :param kwargs: Forwarded unchanged to the :class:`BELGraph` constructor
    :return: A new graph into which every edge of the network has been inserted
    """
    stored_edges = manager.get_network_by_id(network_id).edges

    result = BELGraph(**kwargs)
    for stored_edge in stored_edges:
        # Each edge model knows how to add itself to a graph
        stored_edge.insert_into_graph(result)

    return result

In [9]:
%%time
graph_by_edges = get_graph_by_network_edges(m, 11)
print_summary(graph_by_edges)


Counter({'Protein': 6295, 'RNA': 5821, 'Complex': 1798, 'Gene': 1002, 'Abundance': 593, 'BiologicalProcess': 195, 'Pathology': 146, 'miRNA': 33, 'Reaction': 11, 'Composite': 7})
CPU times: user 2min 10s, sys: 2.61 s, total: 2min 13s
Wall time: 2min 38s

This query works, but needs serious optimization to be generally useful, especially since this kind of query automatically eliminates the need to do in-memory graph join operations.

Getting Multiple Graphs

Using NetworkX


In [19]:
network_ids = [10, 2, 9]

In [31]:
%%time
graph_by_network_ids = m.get_graph_by_ids(network_ids)
print_summary(graph_by_network_ids)


('Abundance', 427)
('Complex', 746)
('Protein', 1757)
('Reaction', 20)
('Pathology', 128)
('BiologicalProcess', 418)
('Gene', 1023)
('RNA', 492)
('Composite', 68)
('miRNA', 47)
CPU times: user 3.03 s, sys: 56.7 ms, total: 3.09 s
Wall time: 3.11 s

Using SQL


In [29]:
def get_graph_by_networks_edges(manager, network_ids, **kwargs):
    """Build one BELGraph from the stored edges of several networks using a single query.

    :param manager: The manager whose ``session`` is used to query the edge store
    :param network_ids: The identifiers of the networks whose edges should be collected
    :param kwargs: Forwarded unchanged to the :class:`BELGraph` constructor
    :return: A new graph into which every edge returned by the query has been inserted
    """
    # Join edges to the network/edge association table, then keep the requested networks
    edges_with_networks = manager.session.query(Edge).join(network_edge)
    edge_query = edges_with_networks.filter(network_edge.c.network_id.in_(network_ids))

    result = BELGraph(**kwargs)
    for stored_edge in edge_query:
        stored_edge.insert_into_graph(result)

    return result

In [30]:
%%time
graph_by_networks_edges = get_graph_by_networks_edges(m, network_ids)
print_summary(graph_by_networks_edges)


('Gene', 1023)
('Abundance', 417)
('RNA', 483)
('Protein', 1729)
('Composite', 68)
('Complex', 743)
('Reaction', 18)
('BiologicalProcess', 412)
('Pathology', 128)
('miRNA', 47)
CPU times: user 59 s, sys: 1.17 s, total: 1min
Wall time: 1min 12s

Getting Edges Matching an Annotation


In [ ]:
graph =

In [ ]:
def get_graph_by_annotation(manager, keyword, value, network_ids=None, **kwargs):
    """Build a BELGraph from the stored edges that carry a given annotation.

    :param manager: The manager whose ``session`` is used to query the edge store
    :param keyword: The keyword of the annotation to match
    :param value: The annotation entry (value) to match under that keyword
    :param network_ids: Optional identifiers of networks to restrict the search to. If ``None``
     (the default), edges from all stored networks are considered.
    :param kwargs: Forwarded unchanged to the :class:`BELGraph` constructor
    :return: A new graph into which every matching edge has been inserted
    """
    # Imported here so this cell does not rely on names missing from the notebook's import cell.
    # NOTE(review): assumes the installed PyBEL exposes these models/tables under these names
    # in pybel.manager.models - confirm against the version in use.
    from pybel.manager.models import Annotation, AnnotationEntry, edge_annotation

    query = manager.session.query(Edge)

    if network_ids is not None:
        # Explicit argument replaces the former dependence on a notebook-level ``network_ids``
        query = query.join(network_edge).filter(network_edge.c.network_id.in_(network_ids))

    # NOTE(review): presumes ``Annotation.keyword`` and ``AnnotationEntry.name`` are the columns
    # holding the annotation keyword and its value, and that SQLAlchemy can infer each join
    # condition from the foreign keys - verify against the PyBEL models.
    edges = query.\
        join(edge_annotation).join(AnnotationEntry).join(Annotation).\
        filter(and_(Annotation.keyword == keyword, AnnotationEntry.name == value))

    graph = BELGraph(**kwargs)

    for edge in edges:
        edge.insert_into_graph(graph)

    return graph

In [ ]:
annotation = m.get_annotation_entry