notebook.community

Edit and run

This notebook measures how fast simple seeding queries run against the edge store, compared with loading whole networks into memory and then operating on them with NetworkX.



In [32]:

    
import sys
import time

import pandas as pd
from pybel import BELGraph, Manager
from pybel.constants import VERSION
from pybel.manager.models import Annotation, AnnotationEntry, Edge, network_edge
from pybel.struct.summary import count_functions
from sqlalchemy import and_



In [2]:

    
# Record the interpreter build so the timings below can be tied to a specific Python version.
print(sys.version)









    



3.6.3 (default, Oct  9 2017, 09:47:56) 
[GCC 4.2.1 Compatible Apple LLVM 9.0.0 (clang-900.0.37)]



In [3]:

    
# Timestamp of this run.
print(time.asctime())









    



Tue Dec 19 14:16:22 2017



In [4]:

    
# PyBEL version under test, as exported by pybel.constants.
print(VERSION)









    



0.9.8-dev



In [5]:

    
# Manager() is created with no arguments; the connection it resolved is visible in the repr
# that the bare `m` on the last line displays.
m = Manager()
m









    Out[5]:





<Manager connection=mysql+mysqldb://root@localhost/pybel?charset=utf8>



In [6]:

    
# Tabulate id, name and version for every network returned by m.list_recent_networks().
# The ids shown here are the ones hard-coded in the timing cells further down.
pd.DataFrame(
    {
        'id': n.id, 
        'name': n.name, 
        'version': n.version
    } 
    for n in m.list_recent_networks()
)









    Out[6]:







  
    
      
      id
      name
      version
    
  
  
    
      0
      10
      Alzheimer's Disease Knowledge Assembly
      4.3.7
    
    
      1
      11
      BEL Framework Large Corpus Document
      20150611
    
    
      2
      9
      CDR-SB Associations
      1.0.5
    
    
      3
      2
      NeuroMMSigDB for Alzheimer Disease
      20171205
    
    
      4
      6
      Paths from CDR-SB to Hippocampal Volumne
      1.0.1



In [ ]:

    
def print_summary(graph):
    """Print the function counts of ``graph``, one ``(function, count)`` pair per line.

    The pairs come from ``count_functions(graph).items()`` and are printed in sorted order.

    :param graph: The graph to summarize; it is handed straight to ``count_functions``.
    """
    # Summarize the argument. The previous body read the global ``graph_by_network_id``
    # instead, so every call reported on that one graph (or raised NameError on a fresh kernel).
    print(*sorted(count_functions(graph).items()), sep='\n')

Getting a Single Graph

Using NetworkX



In [16]:

    
%%time
# Network 11 is listed in the table above as "BEL Framework Large Corpus Document".
graph_by_network_id = m.get_graph_by_id(11)
print_summary(graph_by_network_id)









    



('Protein', 6295)
('BiologicalProcess', 195)
('Complex', 1798)
('Abundance', 593)
('Pathology', 146)
('RNA', 5821)
('Reaction', 11)
('Gene', 1002)
('Composite', 7)
('miRNA', 33)
CPU times: user 910 ms, sys: 45.9 ms, total: 956 ms
Wall time: 1.11 s

Using SQL



In [8]:

    
def get_graph_by_network_edges(manager, network_id, **kwargs):
    """Rebuild a single network as a BELGraph by inserting its stored edges one at a time.

    :param manager: Object providing ``get_network_by_id``
    :param network_id: Identifier handed to ``manager.get_network_by_id``
    :param kwargs: Passed through unchanged to the ``BELGraph`` constructor
    :return: The new graph after every edge of the network has been inserted into it
    """
    stored_edges = manager.get_network_by_id(network_id).edges

    result = BELGraph(**kwargs)
    for stored_edge in stored_edges:
        stored_edge.insert_into_graph(result)
    return result



In [9]:

    
%%time
# Same network id (11) as the NetworkX timing above, this time rebuilt via get_graph_by_network_edges.
graph_by_edges = get_graph_by_network_edges(m, 11)
print_summary(graph_by_edges)









    



Counter({'Protein': 6295, 'RNA': 5821, 'Complex': 1798, 'Gene': 1002, 'Abundance': 593, 'BiologicalProcess': 195, 'Pathology': 146, 'miRNA': 33, 'Reaction': 11, 'Composite': 7})
CPU times: user 2min 10s, sys: 2.61 s, total: 2min 13s
Wall time: 2min 38s

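This query works, but it needs serious optimization before it is generally useful: in the run above it took 2 min 38 s of wall time, versus 1.11 s for loading the same network in memory. The optimization is worth pursuing because this kind of query automatically eliminates the need for in-memory graph join operations.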

Getting Multiple Graphs

Using NetworkX



In [19]:

    
# Ids taken from the network table above: 10 = Alzheimer's Disease Knowledge Assembly,
# 2 = NeuroMMSigDB for Alzheimer Disease, 9 = CDR-SB Associations.
network_ids = [10, 2, 9]



In [31]:

    
%%time
# Baseline for the multi-network case: build one graph for all of network_ids via the manager.
graph_by_network_ids = m.get_graph_by_ids(network_ids)
print_summary(graph_by_network_ids)









    



('Abundance', 427)
('Complex', 746)
('Protein', 1757)
('Reaction', 20)
('Pathology', 128)
('BiologicalProcess', 418)
('Gene', 1023)
('RNA', 492)
('Composite', 68)
('miRNA', 47)
CPU times: user 3.03 s, sys: 56.7 ms, total: 3.09 s
Wall time: 3.11 s

Using SQL



In [29]:

    
def get_graph_by_networks_edges(manager, network_ids, **kwargs):
    """Assemble one BELGraph from the edges linked to any of the given networks.

    :param manager: Object whose ``session`` attribute is used to run the query
    :param network_ids: Identifiers matched against ``network_edge.c.network_id``
    :param kwargs: Passed through unchanged to the ``BELGraph`` constructor
    :return: The new graph after every queried edge has been inserted into it
    """
    joined = manager.session.query(Edge).join(network_edge)
    edge_query = joined.filter(network_edge.c.network_id.in_(network_ids))

    merged = BELGraph(**kwargs)
    for row in edge_query:
        row.insert_into_graph(merged)
    return merged



In [30]:

    
%%time
# NOTE(review): in the recorded run this cell's counts differ from the NetworkX result for the
# same network_ids (e.g. Protein 1729 vs 1757, Abundance 417 vs 427) — the two code paths are
# not yet returning the same graph; investigate before comparing timings.
graph_by_networks_edges = get_graph_by_networks_edges(m, network_ids)
print_summary(graph_by_networks_edges)









    



('Gene', 1023)
('Abundance', 417)
('RNA', 483)
('Protein', 1729)
('Composite', 68)
('Complex', 743)
('Reaction', 18)
('BiologicalProcess', 412)
('Pathology', 128)
('miRNA', 47)
CPU times: user 59 s, sys: 1.17 s, total: 1min
Wall time: 1min 12s

Getting Edges Matching an Annotation



In [ ]:

    
# TODO(review): this cell originally held the bare fragment `graph =`, which is a SyntaxError
# and stops a Restart & Run All. Replace the placeholder once the graph to query is chosen.
graph = None



In [ ]:

    
def get_graph_by_annotation(manager, keyword, value, network_ids=None, **kwargs):
    """Build a BELGraph from the stored edges that carry the annotation ``keyword = value``.

    The draft of this function could not run: it was missing the ``.`` before ``filter``,
    called ``_and`` instead of ``and_``, compared against an undefined ``annotation_id``,
    ignored ``keyword`` and ``value``, and silently read the global ``network_ids``.

    :param manager: Object whose ``session`` attribute is used to run the query
    :param keyword: Annotation keyword to match (compared with ``Annotation.keyword``)
    :param value: Annotation value to match (compared with ``AnnotationEntry.name``)
    :param network_ids: Optional iterable of network identifiers. When given, only edges linked
        to one of these networks through ``network_edge`` are used; when ``None`` (the default)
        edges from every stored network are considered.
    :param kwargs: Passed through unchanged to the ``BELGraph`` constructor
    :return: The new graph after every matching edge has been inserted into it
    """
    # NOTE(review): assumes the PyBEL ORM exposes Edge.annotations, AnnotationEntry.name,
    # AnnotationEntry.annotation and Annotation.keyword — confirm against the installed version.
    # EXISTS-style criteria (any/has) are used instead of joins so that the only join in the
    # query is the network_edge one already exercised by get_graph_by_networks_edges.
    has_annotation = Edge.annotations.any(and_(
        AnnotationEntry.name == value,
        AnnotationEntry.annotation.has(Annotation.keyword == keyword),
    ))
    edges = manager.session.query(Edge).filter(has_annotation)

    if network_ids is not None:
        # The network restriction is an explicit argument rather than a notebook global.
        edges = edges.join(network_edge).filter(network_edge.c.network_id.in_(network_ids))

    graph = BELGraph(**kwargs)

    for edge in edges:
        edge.insert_into_graph(graph)

    return graph



In [ ]:

    
# NOTE(review): unfinished cell — this binds the method object itself; get_annotation_entry
# is not called here, so `annotation` is not an annotation entry.
annotation = m.get_annotation_entry

	id	name	version
0	10	Alzheimer's Disease Knowledge Assembly	4.3.7
1	11	BEL Framework Large Corpus Document	20150611
2	9	CDR-SB Associations	1.0.5
3	2	NeuroMMSigDB for Alzheimer Disease	20171205
4	6	Paths from CDR-SB to Hippocampal Volumne	1.0.1