Subnetwork Generator for GO



In [3]:

    
import pandas as pd
import json

idmap = pd.read_csv('./yeast_clean4.txt', delimiter='\t')
idmap.head()









    Out[3]:






  
    
      
      symbol
      locus_name
      acc_number
      swiss-prot
      sgd
      sequence_length
      3d
      chromosome
    
  
  
    
      0
      AAC1
      YMR056C
      P04710
      ADT1_YEAST
      S000004660
      309
      13
      NaN
    
    
      1
      AAC3
      YBR085W
      P18238
      ADT3_YEAST
      S000000289
      307
      (3)
      2
    
    
      2
      AAD10
      YJR155W
      P47182
      AAD10_YEAST
      S000003916
      288
      10
      NaN
    
    
      3
      AAD14
      YNL331C
      P42884
      AAD14_YEAST
      S000005275
      376
      14
      NaN
    
    
      4
      AAD15
      YOL165C
      Q08361
      AAD15_YEAST
      S000005525
      143
      15
      NaN



In [4]:

    
orf2symbol = {}
for row in idmap.itertuples():
    sym = row[1]
    orf = row[2]
    orf2symbol[orf] = sym



In [5]:

    
# Load GO full dag


treeSourceUrl = 'http://chianti.ucsd.edu/~kono/ci/data/collapsed_go.no_IGI.propagated.small_parent_tree'

# Load the tree data
treeColNames = ['parent', 'child', 'type', 'in_tree']
tree = pd.read_csv(treeSourceUrl, delimiter='\t', names=treeColNames)
tree.tail()









    Out[5]:






  
    
      
      parent
      child
      type
      in_tree
    
  
  
    
      441932
      GO:0015031
      YHR083W
      gene
      NOT_TREE
    
    
      441933
      GO:1902582
      YHR083W
      gene
      NOT_TREE
    
    
      441934
      GO:1902580
      YHR083W
      gene
      NOT_TREE
    
    
      441935
      GO:0098799
      YHR083W
      gene
      NOT_TREE
    
    
      441936
      GO:0098798
      YHR083W
      gene
      NOT_TREE



In [6]:

    
# Filter  terms only.

term2genes = {}

filtered = []
for row in tree.itertuples():
    
    t = row[3]
    if t == 'gene':
        term = row[1]
        
        terms = []
        if term in term2genes.keys():
            terms = term2genes[term]
        
        terms.append(row[2])
        term2genes[term] = terms
        continue
        
    r = (row[1], row[2], row[4])
    filtered.append(r)

dag = pd.DataFrame(filtered, columns=['parent', 'child', 'in_tree'])


print(term2genes['GO:0070822'])
dag.head()









    



['YDL076C', 'YMR053C', 'YBR095C', 'YOL004W', 'YPR023C', 'YPL181W', 'YKL185W', 'YMR263W', 'YMR075W', 'YAL013W', 'YNL097C', 'YDR207C', 'YIL084C', 'YNL330C', 'YNL309W', 'YPL139C']






    Out[6]:






  
    
      
      parent
      child
      in_tree
    
  
  
    
      0
      GO:0046434
      GO:0009395
      TREE
    
    
      1
      GO:0046434
      GO:0046855
      NOT_TREE
    
    
      2
      GO:0046434
      GO:0046081
      NOT_TREE
    
    
      3
      GO:0046434
      GO:0035863
      NOT_TREE
    
    
      4
      GO:0046434
      GO:0046168
      NOT_TREE



In [7]:

    
import networkx as nx

godag = nx.from_pandas_dataframe(dag, 'child', 'parent', ['in_tree'], create_using=nx.DiGraph())
godag.edges(data=True)[0]
godag.get_edge_data('GO:0019203', 'GO:0016791')









    Out[7]:





{'in_tree': 'TREE'}



In [8]:

    
subnet = {
    'data': {
        'name': 'subnet1'
    },
    'elements': {
        'nodes': [],
        'edges': []
    }
}

def get_node(id):
    name = ''
    n_type = 'term'
    
    if id == 'GO:00SUPER':
        name = "Root"
    elif id.startswith('Y') or  id.startswith('Q'):
        if id in orf2symbol.keys():
            name = orf2symbol[id].split(';')[0]
        else:
            name = id
        n_type = 'gene'
    else:
        name = obo[id].name
    
#     if id == start:
#         n_type = 'origin'
    
    node = {
        'data': {
            'id': id,
            'name': name,
            'type': n_type 
        }
    }  
    return node

def get_edge(source, target, in_tree):
    edge = {
        'data': {
            'source': source,
            'target': target,
            'in_tree': in_tree
        }
    }
    
    return edge



In [9]:

    
from goatools import obo_parser
oboUrl = './data/go.obo'
obo = obo_parser.GODag(oboUrl, optional_attrs=['def'])









    



load obo file ./data/go.obo
./data/go.obo: fmt(1.2) rel(2017-01-13) 47,943 GO Terms



In [12]:

    
def create_dag(paths, start_terms, goal):
    
    net = {
        'data': {
            'name': 'Path to ' + goal
        },
        
        'elements': {
            'nodes': [],
            'edges': []
        }
    }
    
    nodes = set()
    echeck = set()
    edges = []

    for p in paths:
        p_len = len(p)
        for i, node in enumerate(p):
            if i > p_len-2:
                break
            nodes.add(node)
            nodes.add(p[i+1])
            key = node + p[i+1]
            if key in echeck:
                continue

#             if p[i+1] == goal:
#                 continue
            
            echeck.add(key)

            in_tree = godag.get_edge_data(node, p[i+1])['in_tree']
            edge = get_edge(node, p[i+1], in_tree)
            edges.append(edge)

    cynodes = []
    
    # add genes
    for start in start_terms:
        genes = term2genes[start]
        for g in genes:
            edge = get_edge(g, start, 'gene')
            edges.append(edge)
        for n in genes:
            cynodes.append(get_node(n))

    # Add nodes
    for n in nodes:
        cynodes.append(get_node(n))

   
    net['elements']['nodes'] = cynodes
    net['elements']['edges'] = edges
    
    print(nodes)
    return net



In [10]:

    
# paths1 = nx.shortest_simple_paths(godag, source='GO:0070822', target='GO:00SUPER')
# paths2 = nx.shortest_simple_paths(godag, source='GO:0000812', target='GO:00SUPER')

goal = 'GO:00SUPER'
terms = ['GO:0070822', 'GO:0000812', 'GO:0070143']

for term in terms:
    paths = nx.shortest_simple_paths(godag, source=term, target=goal)
    with open('./data/' + term.replace(':', '') + '.cyjs', 'w') as outfile:
        json.dump(create_dag(paths, term, goal), outfile)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-731ccd23f4ca> in <module>()
      8     paths = nx.shortest_simple_paths(godag, source=term, target=goal)
      9     with open('./data/' + term.replace(':', '') + '.cyjs', 'w') as outfile:
---> 10         json.dump(create_dag(paths, term, goal), outfile)

NameError: name 'create_dag' is not defined

For figure 3:

Ex 1

For the first "And example", the parent term is: nuclear chromatin GO:0000790

The two children are: Sin3-type complex GO:0070822 Swr1 complex GO:0000812

You can do type command "grep GO:0000790 precollapse.propagated.2016" to verify in Mike's file that GO:0070822 and GO:0000812 are children of GO:0000790.

Ex 2

For the second "Or example", the parent term is: mitochondrial respiratory chain GO:0005746

The two children are: mitochondrial respiratory chain complex III GO:0005750 mitochondrial respiratory chain complex IV GO:0005751

The third example is "GFP-GI" is: response to endoplasmic reticulum stress GO:0034976



In [13]:

    
# Test sub-routes
goal1 = 'GO:0000790'
goal2 = 'GO:0005746'

terms1 = ['GO:0070822', 'GO:0000812']
terms2 = ['GO:0005750', 'GO:0005751']

all_paths = []
for term in terms1:
    plist = nx.shortest_simple_paths(godag, source=term, target=goal1)
    for p in plist:
        all_paths.append(p)

with open('./data/' + goal1.replace(':', '') + '.cyjs', 'w') as outfile:
    json.dump(create_dag(all_paths, terms1, 'N/A'), outfile)









    



{'GO:0070822', 'GO:0000790', 'GO:0000812'}



In [16]:

    
paths = nx.all_shortest_paths(godag, source='GO:0000790', target='GO:00SUPER')
print(list(paths))









    



[['GO:0000790', 'GO:0000785', 'GO:0032991', 'GO:0005575', 'GO:00SUPER']]

	symbol	locus_name	acc_number	swiss-prot	sgd	sequence_length	3d	chromosome
0	AAC1	YMR056C	P04710	ADT1_YEAST	S000004660	309	13	NaN
1	AAC3	YBR085W	P18238	ADT3_YEAST	S000000289	307	(3)	2
2	AAD10	YJR155W	P47182	AAD10_YEAST	S000003916	288	10	NaN
3	AAD14	YNL331C	P42884	AAD14_YEAST	S000005275	376	14	NaN
4	AAD15	YOL165C	Q08361	AAD15_YEAST	S000005525	143	15	NaN

	parent	child	type	in_tree
441932	GO:0015031	YHR083W	gene	NOT_TREE
441933	GO:1902582	YHR083W	gene	NOT_TREE
441934	GO:1902580	YHR083W	gene	NOT_TREE
441935	GO:0098799	YHR083W	gene	NOT_TREE
441936	GO:0098798	YHR083W	gene	NOT_TREE

	parent	child	in_tree
0	GO:0046434	GO:0009395	TREE
1	GO:0046434	GO:0046855	NOT_TREE
2	GO:0046434	GO:0046081	NOT_TREE
3	GO:0046434	GO:0035863	NOT_TREE
4	GO:0046434	GO:0046168	NOT_TREE