Subnetwork Generator for GO


In [3]:
import pandas as pd
import json

idmap = pd.read_csv('./yeast_clean4.txt', delimiter='\t')
idmap.head()


Out[3]:
symbol locus_name acc_number swiss-prot sgd sequence_length 3d chromosome
0 AAC1 YMR056C P04710 ADT1_YEAST S000004660 309 13 NaN
1 AAC3 YBR085W P18238 ADT3_YEAST S000000289 307 (3) 2
2 AAD10 YJR155W P47182 AAD10_YEAST S000003916 288 10 NaN
3 AAD14 YNL331C P42884 AAD14_YEAST S000005275 376 14 NaN
4 AAD15 YOL165C Q08361 AAD15_YEAST S000005525 143 15 NaN

In [4]:
orf2symbol = {}
for row in idmap.itertuples():
    sym = row[1]
    orf = row[2]
    orf2symbol[orf] = sym

In [5]:
# Load GO full dag


treeSourceUrl = 'http://chianti.ucsd.edu/~kono/ci/data/collapsed_go.no_IGI.propagated.small_parent_tree'

# Load the tree data
treeColNames = ['parent', 'child', 'type', 'in_tree']
tree = pd.read_csv(treeSourceUrl, delimiter='\t', names=treeColNames)
tree.tail()


Out[5]:
parent child type in_tree
441932 GO:0015031 YHR083W gene NOT_TREE
441933 GO:1902582 YHR083W gene NOT_TREE
441934 GO:1902580 YHR083W gene NOT_TREE
441935 GO:0098799 YHR083W gene NOT_TREE
441936 GO:0098798 YHR083W gene NOT_TREE

In [6]:
# Filter  terms only.

term2genes = {}

filtered = []
for row in tree.itertuples():
    
    t = row[3]
    if t == 'gene':
        term = row[1]
        
        terms = []
        if term in term2genes.keys():
            terms = term2genes[term]
        
        terms.append(row[2])
        term2genes[term] = terms
        continue
        
    r = (row[1], row[2], row[4])
    filtered.append(r)

dag = pd.DataFrame(filtered, columns=['parent', 'child', 'in_tree'])


print(term2genes['GO:0070822'])
dag.head()


['YDL076C', 'YMR053C', 'YBR095C', 'YOL004W', 'YPR023C', 'YPL181W', 'YKL185W', 'YMR263W', 'YMR075W', 'YAL013W', 'YNL097C', 'YDR207C', 'YIL084C', 'YNL330C', 'YNL309W', 'YPL139C']
Out[6]:
parent child in_tree
0 GO:0046434 GO:0009395 TREE
1 GO:0046434 GO:0046855 NOT_TREE
2 GO:0046434 GO:0046081 NOT_TREE
3 GO:0046434 GO:0035863 NOT_TREE
4 GO:0046434 GO:0046168 NOT_TREE

In [7]:
import networkx as nx

godag = nx.from_pandas_dataframe(dag, 'child', 'parent', ['in_tree'], create_using=nx.DiGraph())
godag.edges(data=True)[0]
godag.get_edge_data('GO:0019203', 'GO:0016791')


Out[7]:
{'in_tree': 'TREE'}

In [8]:
subnet = {
    'data': {
        'name': 'subnet1'
    },
    'elements': {
        'nodes': [],
        'edges': []
    }
}

def get_node(id):
    name = ''
    n_type = 'term'
    
    if id == 'GO:00SUPER':
        name = "Root"
    elif id.startswith('Y') or  id.startswith('Q'):
        if id in orf2symbol.keys():
            name = orf2symbol[id].split(';')[0]
        else:
            name = id
        n_type = 'gene'
    else:
        name = obo[id].name
    
#     if id == start:
#         n_type = 'origin'
    
    node = {
        'data': {
            'id': id,
            'name': name,
            'type': n_type 
        }
    }  
    return node

def get_edge(source, target, in_tree):
    edge = {
        'data': {
            'source': source,
            'target': target,
            'in_tree': in_tree
        }
    }
    
    return edge

In [9]:
from goatools import obo_parser
oboUrl = './data/go.obo'
obo = obo_parser.GODag(oboUrl, optional_attrs=['def'])


load obo file ./data/go.obo
./data/go.obo: fmt(1.2) rel(2017-01-13) 47,943 GO Terms

In [12]:
def create_dag(paths, start_terms, goal):
    
    net = {
        'data': {
            'name': 'Path to ' + goal
        },
        
        'elements': {
            'nodes': [],
            'edges': []
        }
    }
    
    nodes = set()
    echeck = set()
    edges = []

    for p in paths:
        p_len = len(p)
        for i, node in enumerate(p):
            if i > p_len-2:
                break
            nodes.add(node)
            nodes.add(p[i+1])
            key = node + p[i+1]
            if key in echeck:
                continue

#             if p[i+1] == goal:
#                 continue
            
            echeck.add(key)

            in_tree = godag.get_edge_data(node, p[i+1])['in_tree']
            edge = get_edge(node, p[i+1], in_tree)
            edges.append(edge)

    cynodes = []
    
    # add genes
    for start in start_terms:
        genes = term2genes[start]
        for g in genes:
            edge = get_edge(g, start, 'gene')
            edges.append(edge)
        for n in genes:
            cynodes.append(get_node(n))

    # Add nodes
    for n in nodes:
        cynodes.append(get_node(n))

   
    net['elements']['nodes'] = cynodes
    net['elements']['edges'] = edges
    
    print(nodes)
    return net

In [10]:
# paths1 = nx.shortest_simple_paths(godag, source='GO:0070822', target='GO:00SUPER')
# paths2 = nx.shortest_simple_paths(godag, source='GO:0000812', target='GO:00SUPER')

goal = 'GO:00SUPER'
terms = ['GO:0070822', 'GO:0000812', 'GO:0070143']

for term in terms:
    paths = nx.shortest_simple_paths(godag, source=term, target=goal)
    with open('./data/' + term.replace(':', '') + '.cyjs', 'w') as outfile:
        json.dump(create_dag(paths, term, goal), outfile)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-731ccd23f4ca> in <module>()
      8     paths = nx.shortest_simple_paths(godag, source=term, target=goal)
      9     with open('./data/' + term.replace(':', '') + '.cyjs', 'w') as outfile:
---> 10         json.dump(create_dag(paths, term, goal), outfile)

NameError: name 'create_dag' is not defined

For figure 3:

Ex 1

For the first "And example", the parent term is: nuclear chromatin GO:0000790

The two children are: Sin3-type complex GO:0070822 Swr1 complex GO:0000812

You can do type command "grep GO:0000790 precollapse.propagated.2016" to verify in Mike's file that GO:0070822 and GO:0000812 are children of GO:0000790.

Ex 2

For the second "Or example", the parent term is: mitochondrial respiratory chain GO:0005746

The two children are: mitochondrial respiratory chain complex III GO:0005750 mitochondrial respiratory chain complex IV GO:0005751

The third example is "GFP-GI" is: response to endoplasmic reticulum stress GO:0034976


In [13]:
# Test sub-routes
goal1 = 'GO:0000790'
goal2 = 'GO:0005746'

terms1 = ['GO:0070822', 'GO:0000812']
terms2 = ['GO:0005750', 'GO:0005751']

all_paths = []
for term in terms1:
    plist = nx.shortest_simple_paths(godag, source=term, target=goal1)
    for p in plist:
        all_paths.append(p)

with open('./data/' + goal1.replace(':', '') + '.cyjs', 'w') as outfile:
    json.dump(create_dag(all_paths, terms1, 'N/A'), outfile)


{'GO:0070822', 'GO:0000790', 'GO:0000812'}

In [16]:
paths = nx.all_shortest_paths(godag, source='GO:0000790', target='GO:00SUPER')
print(list(paths))


[['GO:0000790', 'GO:0000785', 'GO:0032991', 'GO:0005575', 'GO:00SUPER']]